Merge pull request #5066 from OpenMathLib/develop

Merge changes from develop in preparation of the 0.3.29 release
1 year ago · 9207052d85
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -89,20 +89,13 @@ task:
      type: text/plain

 macos_instance:
  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
  image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest
 task:
  name: AppleM1/LLVM armv7-androidndk xbuild
  compile_script:
  - brew install android-ndk
  - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
  - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
  - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" 
  - ls /System/Volumes/Data/opt/homebrew
  - ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk
  - find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
  - #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
  - #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
  - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26d/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
  - brew install --cask android-ndk
  - export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk"
  - export CC=/opt/homebrew/share/android-ndk/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
  - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
  always:
    config_artifacts:
@@ -132,9 +125,9 @@ task:
  - make USE_OPENMP=1

 FreeBSD_task:
  name: FreeBSD-gcc12
  name: FreeBSD-gcc
  freebsd_instance:
    image_family: freebsd-13-3
    image_family: freebsd-14-1
  install_script:
  - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
  compile_script:
@@ -143,9 +136,9 @@ FreeBSD_task:


 FreeBSD_task:
  name: freebsd-gcc12-ilp64
  name: freebsd-gcc-ilp64
  freebsd_instance:
    image_family: freebsd-13-3
    image_family: freebsd-14-1
  install_script:
  - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
  compile_script:
@@ -155,7 +148,7 @@ FreeBSD_task:
 FreeBSD_task:
  name: FreeBSD-clang-openmp
  freebsd_instance:
    image_family: freebsd-13-3
    image_family: freebsd-14-1
  install_script:
  - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc 
  - ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -23,7 +23,7 @@ jobs:
          python-version: "3.10"

      - name: Install MkDocs and doc theme packages
        run: pip install mkdocs mkdocs-material mkdocs-git-revision-date-localized-plugin
        run: pip install mkdocs mkdocs-material mkdocs-git-revision-date-localized-plugin mkdocs-mermaid2-plugin

      - name: Build docs site
        run: mkdocs build
--- a/.github/workflows/dynamic_arch.yml
+++ b/.github/workflows/dynamic_arch.yml
@@ -158,7 +158,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
        msystem: [UCRT64, MINGW32, CLANG64, CLANG32]
        msystem: [UCRT64, MINGW32, CLANG64]
        idx: [int32, int64]
        build-type: [Release]
        include:
@@ -174,14 +174,6 @@ jobs:
            idx: int32
            target-prefix: mingw-w64-clang-x86_64
            fc-pkg: fc
            # Compiling with Flang 16 seems to cause test errors on machines
            # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
            no-avx512-flags: -DNO_AVX512=1
          - msystem: CLANG32
            idx: int32
            target-prefix: mingw-w64-clang-i686
            fc-pkg: cc
            c-lapack-flags: -DC_LAPACK=ON
          - msystem: UCRT64
            idx: int64
            idx64-flags: -DBINARY=64 -DINTERFACE64=1
@@ -192,9 +184,6 @@ jobs:
            idx64-flags: -DBINARY=64 -DINTERFACE64=1
            target-prefix: mingw-w64-clang-x86_64
            fc-pkg: fc
            # Compiling with Flang 16 seems to cause test errors on machines
            # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
            no-avx512-flags: -DNO_AVX512=1
          - msystem: UCRT64
            idx: int32
            target-prefix: mingw-w64-ucrt-x86_64
@@ -203,8 +192,6 @@ jobs:
        exclude:
          - msystem: MINGW32
            idx: int64
          - msystem: CLANG32
            idx: int64

    defaults:
      run:
@@ -280,8 +267,6 @@ jobs:
                -DNUM_THREADS=64 \
                -DTARGET=CORE2 \
                ${{ matrix.idx64-flags }} \
                ${{ matrix.c-lapack-flags }} \
                ${{ matrix.no-avx512-flags }} \
                -DCMAKE_C_COMPILER_LAUNCHER=ccache \
                -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
                ..
--- a/.github/workflows/harmonyos.yml
+++ b/.github/workflows/harmonyos.yml
@@ -0,0 +1,37 @@
 name: harmonyos

 on: [push, pull_request]

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

 permissions:
  contents: read # to fetch code (actions/checkout)

 jobs:
  build:
    if: "github.repository == 'OpenMathLib/OpenBLAS'"
    runs-on: ubuntu-latest
    env:
      OHOS_NDK_CMAKE: $GITHUB_WORKSPACE/ohos-sdk/linux/native/build-tools/cmake/bin/cmake
      COMMON_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/ohos-sdk/linux/native/build/cmake/ohos.toolchain.cmake \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
    steps:
    - uses: actions/checkout@v4
    - name: ndk-install
      run: |
        wget https://repo.huaweicloud.com/harmonyos/os/4.1.1-Release/ohos-sdk-windows_linux-public.tar.gz
        tar -xf ohos-sdk-windows_linux-public.tar.gz
        cd ohos-sdk/linux
        unzip -q native-linux-x64-4.1.7.8-Release.zip
        cd -
    - name: build-armv8
      run: |
       mkdir build && cd build
       ${{ env.OHOS_NDK_CMAKE }} ${{ env.COMMON_CMAKE_OPTIONS }} -DOHOS_ARCH="arm64-v8a" \
       -DTARGET=ARMV8 -DNOFORTRAN=1 ..
       ${{ env.OHOS_NDK_CMAKE }} --build . -j $(nproc)
       
--- a/.github/workflows/loongarch64.yml
+++ b/.github/workflows/loongarch64.yml
@@ -9,22 +9,31 @@ concurrency:
 jobs:
  TEST:
    if: "github.repository == 'OpenMathLib/OpenBLAS'"
    runs-on: ubuntu-latest
    runs-on: ubuntu-24.04
    strategy:
      fail-fast: false
      matrix:
        include:
          - target: LOONGSONGENERIC
            triple:  loongarch64-unknown-linux-gnu
            triple:  loongarch64-linux-gnu
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC
          - target: LOONGSON3R5
            triple: loongarch64-unknown-linux-gnu
            triple: loongarch64-linux-gnu
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5
          - target: LOONGSON2K1000
            triple: loongarch64-unknown-linux-gnu
            triple: loongarch64-linux-gnu
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
          - target: LA64_GENERIC
            triple:  loongarch64-linux-gnu
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC
          - target: LA464
            triple: loongarch64-linux-gnu
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464
          - target: LA264
            triple: loongarch64-linux-gnu
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264
          - target: DYNAMIC_ARCH
            triple: loongarch64-unknown-linux-gnu
            triple: loongarch64-linux-gnu
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC

    steps:
@@ -33,33 +42,9 @@ jobs:

      - name: Install APT deps
        run: |
          sudo apt-get update
          sudo apt-get install autoconf automake autotools-dev ninja-build make ccache

      - name: Download and install loongarch64-toolchain
        run: |
          wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.1/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
          #wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
          tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt

      - name: Checkout qemu
        uses: actions/checkout@v3
        with:
          repository: qemu/qemu
          path: qemu
          ref: master

      - name: Install qemu
        run: |
          cd qemu
          ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=loongarch64-linux-user --disable-system --static
          make -j$(nproc)
          make install

      - name: Set env
        run: |
          echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
          echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV
          sudo apt-get update && \
          sudo apt-get install autoconf automake autotools-dev ninja-build make ccache qemu-user-static \
                               gcc-14-loongarch64-linux-gnu g++-14-loongarch64-linux-gnu gfortran-14-loongarch64-linux-gnu

      - name: Compilation cache
        uses: actions/cache@v3
@@ -80,54 +65,55 @@ jobs:
      - name: Disable utest dsdot:dsdot_n_1
        run: |
          echo -n > utest/test_dsdot.c
          echo "Due to the qemu versions 7.2 causing utest cases to fail,"
          echo "Due to the current version of qemu causing utest cases to fail,"
          echo "the utest dsdot:dsdot_n_1 have been temporarily disabled."

      - name: Build OpenBLAS
        run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
        run: |
          make CC='ccache ${{ matrix.triple }}-gcc-14 -static' FC='ccache ${{ matrix.triple }}-gfortran-14 -static' \
          RANLIB='ccache ${{ matrix.triple }}-gcc-ranlib-14' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)

      - name: Test
        run: |
          export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
          qemu-loongarch64 ./utest/openblas_utest
          qemu-loongarch64 ./utest/openblas_utest_ext
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat2 < ./ctest/sin2
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat2 < ./ctest/din2
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat2 < ./ctest/cin2
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat2 < ./ctest/zin2
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat3 < ./ctest/sin3
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat3 < ./ctest/din3
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat3 < ./ctest/cin3
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat3 < ./ctest/zin3
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat1
          qemu-loongarch64-static ./utest/openblas_utest
          qemu-loongarch64-static ./utest/openblas_utest_ext
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1
          rm -f ./test/?BLAT2.SUMM
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat2 < ./test/sblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat2 < ./test/dblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat2 < ./test/cblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat2 < ./test/zblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
          rm -f ./test/?BLAT2.SUMM
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat2 < ./test/sblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat2 < ./test/dblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat2 < ./test/cblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat2 < ./test/zblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
          rm -f ./test/?BLAT3.SUMM
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat3 < ./test/sblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat3 < ./test/dblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat3 < ./test/cblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat3 < ./test/zblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat
          rm -f ./test/?BLAT3.SUMM
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat3 < ./test/sblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat3 < ./test/dblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat3 < ./test/cblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat3 < ./test/zblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat
--- a/.github/workflows/loongarch64_clang.yml
+++ b/.github/workflows/loongarch64_clang.yml
@@ -20,6 +20,12 @@ jobs:
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5
          - target: LOONGSON2K1000
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
          - target: LA64_GENERIC
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC
          - target: LA464
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464
          - target: LA264
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264
          - target: DYNAMIC_ARCH
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC

--- a/.github/workflows/nightly-Homebrew-build.yml
+++ b/.github/workflows/nightly-Homebrew-build.yml
@@ -69,7 +69,7 @@ jobs:
          mv *.bottle.tar.gz bottles

      - name: Upload bottle
        uses: actions/upload-artifact@v1
        uses: actions/upload-artifact@v4
        with:
          name: openblas--HEAD.catalina.bottle.tar.gz
          path: bottles
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,13 +2,14 @@
 ## Author: Hank Anderson <hank@statease.com>
 ##

 cmake_minimum_required(VERSION 2.8.5)
 cmake_minimum_required(VERSION 3.16.0)

 set (CMAKE_ASM_SOURCE_FILE_EXTENSIONS "S")
 project(OpenBLAS C ASM)

 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
 set(OpenBLAS_PATCH_VERSION 28)
 set(OpenBLAS_PATCH_VERSION 28.dev)

 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")

@@ -102,6 +103,10 @@ endif()

 message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")

 if (USE_OPENMP)
  find_package(OpenMP REQUIRED)
 endif ()

 include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
 include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")

@@ -258,8 +263,13 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago
  endif()
 endif()

 if (APPLE AND BUILD_SHARED_LIBS)
 set(CMAKE_MACOSX_RPATH ON)
 if (USE_OPENMP)
  if(BUILD_STATIC_LIBS)
    target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C)
  endif()
  if(BUILD_SHARED_LIBS)
    target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C)
  endif()
 endif()

 # Seems that this hack doesn't required since macOS 11 Big Sur
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -226,3 +226,9 @@ In chronological order:

 * Dirreke <https://github.com/mseminatore>
  * [2024-01-16] Add basic support for the CSKY architecture

 * Christopher Daley <https://github.com/cdaley>
  * [2024-01-24] Optimize GEMV forwarding on ARM64 systems

 * Aniket P. Garade <https://github.com/garadeaniket>   Sushil Pratap Singh <https://github.com/SushilPratap04>  Juliya James <https://github.com/Juliya32> 
  *  [2024-12-13] Optimized swap and rot  Level-1 BLAS routines with ARM SVE  
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,99 @@
 OpenBLAS ChangeLog
 ====================================================================
 Version 0.3.29
 12-Jan-2025

 general:
 - fixed a potential NULL pointer dereference in multithreaded builds
 - added function aliases for GEMMT using its new name GEMMTR adopted by Reference-BLAS
 - fixed a build failure when building without LAPACK_DEPRECATED functions
 - the minimum required CMake version for CMake-based builds was raised to 3.16.0 in order 
   to remove many compatibility and deprecation warnings
 - added more detailed CMake rules for OpenMP builds (mainly to support recent LLVM)
 - fixed the behavior of the recently added CBLAS_?GEMMT functions with row-major data
 - improved thread scaling of multithreaded SBGEMV 
 - improved thread scaling of multithreaded TRTRI 
 - fixed compilation of the CBLAS testsuite with gcc14 (and no Fortran compiler)
 - added support for option handling changes in flang-new from LLVM18 onwards 
 - added support for recent calling conventions changes in Cray and NVIDIA compilers
 - added support for compilation with the NAG Fortran compiler
 - fixed placement of the -fopenmp flag and libsuffix in the generated pkgconfig file
 - improved the CMakeConfig file generated by the Makefile build
 - fixed const-correctness of cblas_?geadd in cblas.h
 - fixed a potential inaccuracy in multithreaded BLAS3 calls
 - fixed empty implementations of get/set_affinity that print a warning in OpenMP builds
 - fixed function signatures for TRTRS in the converted C version of LAPACK
 - fixed omission of several single-precision LAPACK symbols in the shared library 
 - improved build instructions for the provided "pybench" benchmarks
 - improved documentation, including added build instructions for WoA and HarmonyOS
   as well as descriptions of environment variables that affect build and runtime behavior
 - added a separate "make install_tests" target for use with cross-compilations
 - integrated improvements and corrections from Reference-LAPACK:
   - removed a comparison in LAPACKE ?tpmqrt that is always false (LAPACK PR 1062)
   - fixed the leading dimension for B in tests for GGEV (LAPACK PR 1064)
   - replaced the ?LARFT functions with a recursive implementation (LAPACK PR 1080)

 arm:
 - fixed build with recent versions of the NDK (missing .type declaration of symbols)

 arm64:
 - fixed a long-standing bug in the (generic) c/zgemm_beta kernel that could lead to
   reads and writes outside the array bounds in some circumstances
 - rewrote cpu autodetection to scan all cores and return the highest performing type
 - improved the DGEMM performance for SVE targets and small matrix sizes
 - improved dimension criteria for forwarding from GEMM to GEMV kernels
 - added SVE kernels for ROT and SWAP
 - improved SVE kernels for SGEMV and DGEMV on A64FX and NEOVERSEV1
 - added support for using the "small matrix" kernels with CMake as well
 - fixed compilation on Windows on Arm
 - improved compile-time detection of SVE capability
 - added cpu autodetection and initial support for Apple M4
 - added support for compilation on systems running IOS
 - added support for compilation on NetBSD ("evbarm" architecture)
 - fixed NRM2 implementations for generic SVE targets and the Neoverse N2
 - fixed compilation for SVE-capable targets with the NVIDIA compiler

 x86_64:
 - fixed a wrong storage size in the SBGEMV kernel for Cooper Lake
 - added cpu autodetection for Intel Granite Rapids
 - added cpu autodetection for AMD Ryzen 5 series
 - added optimized SOMATCOPY_CT for AVX-capable targets
 - fixed the fallback implementation of GEMM3M in GENERIC builds
 - tentatively re-enabled builds with the EXPRECISION option
 - worked around a miscompilation of tests with mingw32-gfortran14
 - added support for compilation with the Intel oneAPI 2025.0 compiler on Windows

 power:
 - fixed multithreaded SBGEMM
 - fixed a CMake build problem on POWER10
 - improved the performance of SGEMV
 - added vectorized implementations of SBGEMV and support for forwarding 1xN SBGEMM to them
 - fixed illegal instructions and potential memory overflow in SGEMM on PPCG4
 - fixed handling of NaN and Inf arguments in SSCAL and DSCAL on PPC440,G4 and 970
 - added improved CGEMM and ZGEMM kernels for POWER10
 - added Makefile logic to remove all optimization flags in DEBUG builds

 mips64:
 - fixed compilation with gcc14
 - fixed GEMM parameter selection for the MIPS64_GENERIC target
 - fixed a potential build failure when compiling with OpenMP

 loongarch64:
 - fixed compilation for Loongson3 with recent versions of gmake
 - fixed a potential loss of precision in Loongson3A GEMM
 - fixed a potential build failure when compiling with OpenMP
 - added optimized SOMATCOPY for LASX-capable targets
 - introduced a new cpu naming scheme while retaining compatibility
 - added support for cross-compiling Loongarch64 targets with CMake
 - added support for compilation with LLVM

 riscv64:
 - removed thread yielding overhead caused by sched_yield
 - replaced some non-standard intrinsics with their official names
 - fixed and sped up the implementations of CGEMM/ZGEMM TCOPY for vector lenghts 128 and 256
 - improved the performance of SNRM2/DNRM2 for RVV1.0 targets
 - added optimized ?OMATCOPY_CN kernels for RVV1.0 targets

 ====================================================================
 Version 0.3.28
 8-Aug-2024
--- a/+ 3
+++ b/+ 3
@@ -426,6 +426,9 @@ dummy :
 install :
 	$(MAKE) -f Makefile.install install

 install_tests :
 	$(MAKE) -f Makefile.install install_tests

 clean ::
 	@for d in $(SUBDIRS_ALL) ; \
 	do if test -d $$d; then \
--- a/Makefile.arm64
+++ b/Makefile.arm64
@@ -351,4 +351,31 @@ endif

 endif

 else
 # NVIDIA HPC options necessary to enable SVE in the compiler
 ifeq ($(CORE), THUNDERX2T99)
 CCOMMON_OPT += -tp=thunderx2t99
 FCOMMON_OPT += -tp=thunderx2t99
 endif
 ifeq ($(CORE), NEOVERSEN1)
 CCOMMON_OPT += -tp=neoverse-n1
 FCOMMON_OPT += -tp=neoverse-n1
 endif
 ifeq ($(CORE), NEOVERSEV1)
 CCOMMON_OPT += -tp=neoverse-v1
 FCOMMON_OPT += -tp=neoverse-v1
 endif
 ifeq ($(CORE), NEOVERSEV2)
 CCOMMON_OPT += -tp=neoverse-v2
 FCOMMON_OPT += -tp=neoverse-v2
 endif
 ifeq ($(CORE), ARMV8SVE)
 CCOMMON_OPT += -tp=neoverse-v2
 FCOMMON_OPT += -tp=neoverse-v2
 endif
 ifeq ($(CORE), ARMV9SVE)
 CCOMMON_OPT += -tp=neoverse-v2
 FCOMMON_OPT += -tp=neoverse-v2
 endif

 endif
--- a/Makefile.install
+++ b/Makefile.install
@@ -14,6 +14,9 @@ endif
 ifeq ($(INTERFACE64),1)
 USE_64BITINT=1
 endif
 ifeq ($(USE_OPENMP),1)
 	FOMP_OPT:= -fopenmp
 endif

 PREFIX ?= /opt/OpenBLAS

@@ -178,6 +181,7 @@ endif
 	@echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)"
 	@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
 	@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
 	@echo 'omp_opt='$(FOMP_OPT) >> "$(PKGFILE)"
 	@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(TARGET) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
 	@echo 'version='$(VERSION) >> "$(PKGFILE)"
 	@echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)"
@@ -187,22 +191,29 @@ endif
 #Generating OpenBLASConfig.cmake
 	@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
 	@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 	@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 	@echo "file(REAL_PATH \"../../..\" _OpenBLAS_ROOT_DIR BASE_DIRECTORY \$${CMAKE_CURRENT_LIST_DIR} )" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 	@echo "SET(OpenBLAS_INCLUDE_DIRS \$${_OpenBLAS_ROOT_DIR}/include)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"

 ifneq ($(NO_SHARED),1)
 #ifeq logical or
 ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
 	@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 	@echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 endif
 ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
 	@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 	@echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/bin/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 endif
 ifeq ($(OSNAME), Darwin)
 	@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 	@echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 endif
 	@echo "add_library(OpenBLAS::OpenBLAS SHARED IMPORTED)"
 	@echo "target_include_directories(OpenBLAS::OpenBLAS INTERFACE \$${OpenBLAS_INCLUDE_DIRS})"
 ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
 	@echo "set_property(TARGET OpenBLAS::OpenBLAS PROPERTY IMPORTED_LOCATION \$${OpenBLAS_LIBRARIES})"
 	@echo "set_property(TARGET OpenBLAS::OpenBLAS PROPERTY IMPORTED_IMPLIB \$${_OpenBLAS_ROOT_DIR}/lib/libopenblas.lib)"
 endif
 else
 #only static
 	@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 	@echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 endif
 #Generating OpenBLASConfigVersion.cmake
 	@echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@@ -216,3 +227,96 @@ endif
 	@echo "  endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
 	@echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
 	@echo Install OK!

 install_tests : lib.grd
 ifneq ($(ONLY_CBLAS), 1)
 	@install -m 666 utest/openblas_utest $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 utest/openblas_utest_ext $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
 ifndef NO_FBLAS
 ifeq ($(BUILD_BFLOAT16),1)
 	@install -m 666 test/test_sbgemm $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 endif
 ifeq ($(BUILD_SINGLE),1)
 	@install -m 666 test/sblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/sblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/sblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/sblat2.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/sblat3.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 endif
 ifeq ($(BUILD_DOUBLE),1)
 	@install -m 666 test/dblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/dblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/dblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/dblat2.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/dblat3.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 endif
 ifeq ($(BUILD_COMPLEX),1)
 	@install -m 666 test/cblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/cblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/cblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/cblat2.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/cblat3.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 ifeq ($(ARCH), filter($(ARCH), x86 x86_64 ia64 MIPS))
 	@install -m 666 test/cblat3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/cblat3_3m.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 endif
 endif
 ifeq ($(BUILD_COMPLEX16),1)
 	@install -m 666 test/zblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/zblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/zblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/zblat2.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/zblat3.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 ifeq ($(ARCH), filter($(ARCH), x86 x86_64 ia64 MIPS))
 	@install -m 666 test/zblat3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 test/zblat3_3m.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 endif
 endif
 endif
 endif
 ifneq ($(ONLY_CBLAS), 1)
 ifeq ($(BUILD_SINGLE),1)
 	@install -m 666 ctest/xscblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/xscblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/xscblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/sin2 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/sin3 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 endif
 ifeq ($(BUILD_DOUBLE),1)
 	@install -m 666 ctest/xdcblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/xdcblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/xdcblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/din2 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/din3 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 endif
 ifeq ($(BUILD_COMPLEX),1)
 	@install -m 666 ctest/xccblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/xccblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/xccblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/cin2 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/cin3 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 ifeq ($(ARCH), filter($(ARCH), x86 x86_64 ia64 MIPS))
 	@install -m 666 ctest/xccblat3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/cin3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 endif
 endif
 ifeq ($(BUILD_COMPLEX16),1)
 	@install -m 666 ctest/xzcblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/xzcblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/xzcblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/zin2 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/zin3 $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 ifeq ($(ARCH), filter($(ARCH), x86 x86_64 ia64 MIPS))
 	@install -m 666 ctest/xzcblat3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 	@install -m 666 ctest/zin3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 endif
 endif

 endif
 ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
@install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR)
 endif
 endif

--- a/Makefile.riscv64
+++ b/Makefile.riscv64
@@ -3,7 +3,7 @@ CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
 endif
 ifeq ($(CORE), x280)
 CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math
 CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d 
 FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
 endif
 ifeq ($(CORE), RISCV64_ZVL256B)
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #

 # This library's version
 VERSION = 0.3.28
 VERSION = 0.3.28.dev

 # If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
 # and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library
--- a/Makefile.system
+++ b/Makefile.system
@@ -282,15 +282,19 @@ GEMM_GEMV_FORWARD = 1
 endif
 ifeq ($(ARCH), power)
 GEMM_GEMV_FORWARD = 1
 GEMM_GEMV_FORWARD_BF16 = 1
 endif

 ifeq ($(SMALL_MATRIX_OPT), 1)
 CCOMMON_OPT += -DSMALL_MATRIX_OPT
 endif
 ifeq ($(GEMM_GEMV_FORWARD), 1)
 ifneq ($(ONLY_CBLAS), 1)
 ifeq ($(GEMM_GEMV_FORWARD), 1)
 CCOMMON_OPT += -DGEMM_GEMV_FORWARD
 endif
 ifeq ($(GEMM_GEMV_FORWARD_BF16), 1)
 CCOMMON_OPT += -DGEMM_GEMV_FORWARD_BF16
 endif
 endif

 # This operation is expensive, so execution should be once.
@@ -376,9 +380,6 @@ OBJCONV = $(CROSS_SUFFIX)objconv
 ifeq ($(NOFORTRAN), 1)
 C_LAPACK = 1
 override FEXTRALIB = 
 ifeq ($(C_COMPILER), GCC)
 CCOMMON_OPT += -Wno-error=incompatible-pointer-types
 endif
 endif

 ifeq ($(C_COMPILER), GCC)
@@ -445,7 +446,7 @@ endif

 ifeq ($(OSNAME), Linux)
 EXTRALIB	+= -lm
 NO_EXPRECISION = 1
 #NO_EXPRECISION = 1
 endif

 ifeq ($(OSNAME), Android)
@@ -571,7 +572,7 @@ NO_BINARY_MODE	= 1
 endif

 ifeq ($(CORE), generic)
 NO_EXPRECISION = 1
 #NO_EXPRECISION = 1
 endif

 ifndef NO_EXPRECISION
@@ -594,7 +595,7 @@ endif
 ifeq ($(ARCH), x86_64)

 ifeq ($(CORE), generic)
 NO_EXPRECISION = 1
 #NO_EXPRECISION = 1
 endif

 ifndef NO_EXPRECISION
@@ -730,7 +731,7 @@ endif
 endif

 ifeq ($(ARCH), loongarch64)
 DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC
 DYNAMIC_CORE = LA64_GENERIC LA264 LA464
 endif

 ifeq ($(ARCH), riscv64)
@@ -827,8 +828,8 @@ BINARY_DEFINED	= 1

 ifeq ($(F_COMPILER), GFORTRAN)
 ifeq ($(C_COMPILER), GCC)
 # EXPRECISION	= 1
 # CCOMMON_OPT	+= -DEXPRECISION
 EXPRECISION	= 1
 CCOMMON_OPT	+= -DEXPRECISION
 endif
 endif
 endif
@@ -1612,6 +1613,13 @@ NO_AFFINITY = 1
 endif
 endif

 ifeq ($(ARCH), POWER)
 ifeq ($(DEBUG), 1)
 CCOMMON_OPT := $(filter-out -O%, $(CCOMMON_OPT)) -O0
 FCOMMON_OPT := $(filter-out -O%, $(FCOMMON_OPT)) -O0
 endif
 endif

 ifdef NO_AFFINITY
 ifeq ($(NO_AFFINITY), 0)
 override undefine NO_AFFINITY
@@ -1723,8 +1731,8 @@ LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx
 override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
 endif
 ifeq ($(F_COMPILER),FLANGNEW)
 LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
 override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
 LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 -mtune=% -mabi=% ,$(FFLAGS))
 override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 -mtune=% -mabi=% ,$(FFLAGS))
 endif

 LAPACK_CFLAGS = $(CFLAGS)
--- a/README.md
+++ b/README.md
@@ -2,12 +2,8 @@

 [![Join the chat at https://gitter.im/xianyi/OpenBLAS](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)

 Travis CI: [![Build Status](https://travis-ci.com/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.com/xianyi/OpenBLAS)

 AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)

 Cirrus CI: [![Build Status](https://api.cirrus-ci.com/github/xianyi/OpenBLAS.svg?branch=develop)](https://cirrus-ci.com/github/xianyi/OpenBLAS)
 <!-- Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/)-->



 [![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
@@ -19,11 +15,14 @@ OSUOSL IBMZ-CI [![Build Status](http://ibmz-ci.osuosl.org/buildStatus/icon?job=O

 OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version.

 Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
 For more information about OpenBLAS, please see:

 - The documentation at [openmathlib.org/OpenBLAS/docs/](http://www.openmathlib.org/OpenBLAS/docs),
 - The home page at [openmathlib.org/OpenBLAS/](http://www.openmathlib.org/OpenBLAS).

 For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib:
 <https://www.netlib.org/blas>. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six
 20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare <https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/> or Youtube <https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek> may be helpful.
 20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare [here](https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/) or YouTube [here](https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek) may be helpful.

 ## Binary Packages

@@ -31,24 +30,29 @@ We provide official binary packages for the following platform:

  * Windows x86/x86_64

 You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/xianyi/OpenBLAS/releases](https://github.com/xianyi/OpenBLAS/releases).
 You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the [Releases section of the GitHub project page](https://github.com/OpenMathLib/OpenBLAS/releases).

 OpenBLAS is also packaged for many package managers - see [the installation section of the docs](http://www.openmathlib.org/OpenBLAS/docs/install/) for details.

 ## Installation from Source

 Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
 using Git from https://github.com/xianyi/OpenBLAS.git. (If you want the most up to date version, be
 sure to use the develop branch - master is several years out of date due to a change of maintainership.)
 Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option.
 Most can also be given directly on the make or cmake command line.
 Obtain the source code from https://github.com/OpenMathLib/OpenBLAS/. Note that the default branch
 is `develop` (a `master` branch is still present, but far out of date).

 Build-time parameters can be chosen in `Makefile.rule`, see there for a short description of each option.
 Most options can also be given directly on the command line as parameters to your `make` or `cmake` invocation.

 ### Dependencies

 Building OpenBLAS requires the following to be installed:

 * GNU Make
 * A C compiler, e.g. GCC or Clang
 * GNU Make or CMake
 * A C compiler, e.g. GCC or Clang 
 * A Fortran compiler (optional, for LAPACK)
 * IBM MASS (optional, see below)

 In general, using a recent version of the compiler is strongly recommended.
 If a Fortran compiler is not available, it is possible to compile an older version of the included LAPACK
 that has been machine-translated to C.

 ### Normal compile

@@ -64,26 +68,31 @@ For building with `cmake`, the usual conventions apply, i.e. create a build dire
 OpenBLAS source directory or separate from it, and invoke `cmake` there with the path to the source tree and any 
 build options you plan to set.

 For more details, see the [Building from source](http://www.openmathlib.org/OpenBLAS/docs/install/#building-from-source)
 section in the docs.

 ### Cross compile

 Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler.
 Set `CC` and `FC` to point to the cross toolchains, and if you use `make`, also set `HOSTCC` to your host C compiler.
 The target must be specified explicitly when cross compiling.

 Examples:

 * On an x86 box, compile this library for a loongson3a CPU:
 * On a Linux system, cross-compiling to an older MIPS64 router board:
  ```sh
  make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
  make BINARY=64 CC=mipsisa64r6el-linux-gnuabi64-gcc FC=mipsisa64r6el-linux-gnuabi64-gfortran HOSTCC=gcc TARGET=P6600
  ```
  or same with the newer mips-crosscompiler put out by Loongson that defaults to the 32bit ABI:
 *  or to a Windows x64 host: 
  ```sh
  make HOSTCC=gcc CC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gcc -mabi=64' FC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gfortran -mabi=64' TARGET=LOONGSON3A
  make CC="i686-w64-mingw32-gcc -Bstatic" FC="i686-w64-mingw32-gfortran -static-libgfortran" TARGET=HASWELL BINARY=32 CROSS=1 NUM_THREADS=20 CONSISTENT_FPCSR=1 HOSTCC=gcc
  ```

 * On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler:
  ```sh
  make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu-   NO_LAPACKE=1 NO_SHARED=1 BINARY=32
  ```
 You can find instructions for other cases both in the "Supported Systems" section below and in
 the [Building from source docs](http://www.openmathlib.org/OpenBLAS/docs/install).
 The `.yml` scripts included with the sources (which contain the
 build scripts for the "continuous integration" (CI) build tests automatically run on every proposed change to the sources) may also provide additional hints.

 When compiling for a more modern CPU target of the same architecture, e.g. `TARGET=SKYLAKEX` on a `HASWELL` host, option `CROSS=1` can be used to suppress the automatic invocation of the tests at the end of the build.

 ### Debug version

@@ -219,6 +228,26 @@ e.g.:
    HOSTCC=gcc HOSTFC=gfortran -j
  ```

 #### LOONGARCH64

 - **LA64_GENERIC**: Optimized Level-3, Level-2 and Level-1 BLAS with scalar instruction
  ```sh
  make HOSTCC=gcc TARGET=LA64_GENERIC CC=loongarch64-unknown-linux-gnu-gcc FC=loongarch64-unknown-linux-gnu-gfortran USE_SIMPLE_THREADED_LEVEL3=1
  ```
  The old-style TARGET=LOONGSONGENERIC is still supported

 - **LA264**: Optimized Level-3, Level-2 and Level-1 BLAS with LSX instruction
  ```sh
  make HOSTCC=gcc TARGET=LA264 CC=loongarch64-unknown-linux-gnu-gcc FC=loongarch64-unknown-linux-gnu-gfortran USE_SIMPLE_THREADED_LEVEL3=1
  ```
  The old-style TARGET=LOONGSON2K1000 is still supported

 - **LA464**: Optimized Level-3, Level-2 and Level-1 BLAS with LASX instruction
  ```sh
  make HOSTCC=gcc TARGET=LA464 CC=loongarch64-unknown-linux-gnu-gcc FC=loongarch64-unknown-linux-gnu-gfortran USE_SIMPLE_THREADED_LEVEL3=1
  ```
  The old-style TARGET=LOONGSON3R5 is still supported

 ### Support for multiple targets in a single library

 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
@@ -236,8 +265,12 @@ on **ZARCH** it comprises Z13 and Z14 as well as generic zarch support.

 On **riscv64**, DYNAMIC_ARCH enables support for riscv64_zvl128b and riscv64_zvl256b in addition to generic riscv64 support.  A compiler that supports RVV 1.0 is required to build OpenBLAS for riscv64 when DYNAMIC_ARCH is enabled.

 The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the
 common code in the library, usually you will want to set this to the oldest model you expect to encounter.
 On **LoongArch64**, it comprises LA264 and LA464 as well as generic LoongArch64 support.

 The `TARGET` option can - and usually **should** - be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the common code in the library, usually you will want to set this to the oldest model you expect to encounter.
 Failure to specify this may lead to advanced instructions being used by the compiler, just because the build host happens to support them. This is most likely to happen when aggressive optimization options are in effect, and the resulting library may then crash with an
 illegal instruction error on weaker hardware, before it even reaches the BLAS routines specifically included for that cpu.

 Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.

 ### Supported OS
@@ -291,24 +324,28 @@ If you compile this library with `USE_OPENMP=1`, you should use the above functi

 ## Reporting bugs

 Please submit an issue in https://github.com/xianyi/OpenBLAS/issues.
 Please submit an issue in https://github.com/OpenMathLib/OpenBLAS/issues.

 ## Contact

 + Use github discussions: https://github.com/OpenMathLib/OpenBLAS/discussions
 * OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users
 * OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev

 ## Change log

 Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 1.13 BSD version.
 Please see Changelog.txt.

 ## Troubleshooting

 * Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first.
 * Please read the [FAQ](http://www.openmathlib.org/OpenBLAS/docs/faq) section of the docs first.
 * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
 * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
  Clang 3.0 will generate the wrong AVX binary code.
 * Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
 * Please use GCC version 6 or LLVM version 6 and above to compile Skylake/CooperLake AVX512 kernels
 * Please use LLVM version 18 and above (version 19 and above on Windows) if you plan to use
  its new flang compiler for Fortran
 * Please use GCC version 11 and above to compile OpenBLAS on the POWER architecture
 * The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
  there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
  the library with `BIGNUMA=1`.
@@ -321,12 +358,12 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2

 ## Contributing

 1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue
 1. [Check for open issues](https://github.com/OpenMathLib/OpenBLAS/issues) or open a fresh issue
   to start a discussion around a feature idea or a bug.
 2. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
 2. Fork the [OpenBLAS](https://github.com/OpenMathLib/OpenBLAS) repository to start making your changes.
 3. Write a test which shows that the bug was fixed or that the feature works as expected.
 4. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.

 ## Donation

 Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation).
 Please see [the donations section](http://www.openmathlib.org/OpenBLAS/docs/about/#donations) in the docs.
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -126,9 +126,17 @@ x280
 RISCV64_ZVL256B

 11.LOONGARCH64:
 // LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5 are legacy names,
 // and it is recommended to use the more standardized naming conventions
 // LA64_GENERIC/LA264/LA464. You can still specify TARGET as
 // LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5 during compilation or runtime,
 // and they will be internally relocated to LA64_GENERIC/LA264/LA464.
 LOONGSONGENERIC
 LOONGSON3R5
 LOONGSON2K1000
 LOONGSON3R5
 LA64_GENERIC
 LA264
 LA464

 12. Elbrus E2000:
 E2K
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -212,7 +212,7 @@ jobs:
     vmImage: 'macOS-latest'
  variables:
     LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
     MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg
     MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/edb4dc2f-266f-47f2-8d56-21bc7764e119/m_HPCKit_p_2023.2.0.49443.dmg
     LIBRARY_PATH: /usr/local/opt/llvm/lib
     MACOS_FORTRAN_COMPONENTS: intel.oneapi.mac.ifort-compiler
  steps:   
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -103,6 +103,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
       sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
       spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
       ssymm.goto dsymm.goto csymm.goto zsymm.goto \
       somatcopy.goto domatcopy.goto comatcopy.goto zomatcopy.goto \
       saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS)

 acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
@@ -276,6 +277,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
       samin.goto damin.goto camin.goto zamin.goto \
       smin.goto dmin.goto \
       saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \
       somatcopy.goto domatcopy.goto comatcopy.goto zomatcopy.goto \
       snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS)

 acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
@@ -2906,6 +2908,29 @@ dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME)
 dznrm2.atlas : dznrm2.$(SUFFIX)
 	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

 ###################################################################################################

 ############################################ SOMATCOPY ############################################
 somatcopy.goto : somatcopy.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

 ###################################################################################################

 ############################################ DOMATCOPY ############################################
 domatcopy.goto : domatcopy.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

 ###################################################################################################

 ############################################ COMATCOPY ############################################
 comatcopy.goto : comatcopy.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

 ###################################################################################################

 ############################################ ZOMATCOPY ############################################
 zomatcopy.goto : zomatcopy.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

 ###################################################################################################

@@ -3435,6 +3460,18 @@ scnrm2.$(SUFFIX) : nrm2.c
 dznrm2.$(SUFFIX) : nrm2.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

 somatcopy.$(SUFFIX) : omatcopy.c
 	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

 domatcopy.$(SUFFIX) : omatcopy.c
 	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

 comatcopy.$(SUFFIX) : omatcopy.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

 zomatcopy.$(SUFFIX) : omatcopy.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^


 smallscaling: smallscaling.c ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread
@@ -3442,4 +3479,4 @@ smallscaling: smallscaling.c ../$(LIBNAME)
 clean ::
 	@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling

 include $(TOPDIR)/Makefile.tail
 include $(TOPDIR)/Makefile.tail
--- a/benchmark/omatcopy.c
+++ b/benchmark/omatcopy.c
@@ -0,0 +1,122 @@
 /***************************************************************************
 Copyright (c) 2024, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

 #include "bench.h"

 #undef OMATCOPY

 #ifndef COMPLEX
 #ifdef DOUBLE
 #define OMATCOPY BLASFUNC(domatcopy)
 #else
 #define OMATCOPY BLASFUNC(somatcopy)
 #endif
 #else
 #ifdef DOUBLE
 #define OMATCOPY BLASFUNC(zomatcopy)
 #else
 #define OMATCOPY BLASFUNC(comatcopy)
 #endif
 #endif
 int main(int argc, char *argv[]){
  FLOAT *a, *b;
  FLOAT alpha[] = {1.0, 0.0};
  char trans = 'N';
  char order = 'C';
  blasint crows, ccols, clda, cldb;
  int loops = 1;
  char *p;

  int from =   1;
  int to   = 200;
  int step =   1;
  int i, j;

  double time1, timeg;

  argc--;argv++;

  if (argc > 0) { from = atol(*argv);            argc--; argv++; }
  if (argc > 0) { to   = MAX(atol(*argv), from); argc--; argv++; }
  if (argc > 0) { step = atol(*argv);            argc--; argv++; }

  if ((p = getenv("OPENBLAS_TRANS"))) {
    trans=*p;
  }
  if ((p = getenv("OPENBLAS_ORDER"))) {
    order=*p;
  }
  TOUPPER(trans);
  TOUPPER(order);
  fprintf(stderr, "From : %3d  To : %3d Step=%d : Trans=%c : Order=%c\n", from, to, step, trans, order);
  p = getenv("OPENBLAS_LOOPS");
  if ( p != NULL ) {
    loops = atoi(p);
  }

  if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }
  if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

 #ifdef __linux
  srandom(getpid());
 #endif

  for (i = 0; i < to * to * COMPSIZE; i++) {
    a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
  }
  for (i = 0; i < to * to * COMPSIZE; i++) {
    b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
  }

  fprintf(stderr, "          SIZE                   Flops             Time\n");
  for (i = from; i <= to; i += step) {
    cldb = clda = crows = ccols = i;
    fprintf(stderr, " ROWS=%4d, COLS=%4d : ", (int)crows, (int)ccols);
    begin();

    for (j=0; j<loops; j++) {
      OMATCOPY (&order, &trans, &crows, &ccols, alpha, a, &clda, b, &cldb);
    }

    end();
    time1 = getsec();

    timeg = time1/loops;
    fprintf(stderr,
 	    " %10.2f MFlops %10.6f sec\n",
 	    COMPSIZE * COMPSIZE * (double)ccols * (double)crows / timeg * 1.e-6, time1);
  }

  free(a);
  free(b);

  return 0;
 }
--- a/benchmark/pybench/README.md
+++ b/benchmark/pybench/README.md
@@ -43,7 +43,17 @@ have all what it takes to build OpenBLAS from source, plus `python` and
 $ python -mpip install numpy meson ninja pytest pytest-benchmark
 ```

 The benchmark syntax is consistent with that of `pytest-benchmark` framework. The incantation to run the suite locally is `$ pytest benchmark/pybench/benchmarks/test_blas.py`.
 The Meson build system looks for the installed OpenBLAS using pkgconfig, so the openblas.pc created during the OpenBLAS build needs
 to be somewhere on the search path of pkgconfig or in a folder pointed to by the environment variable PKG_CONFIG_PATH.

 If you want to build the benchmark suite using flang (or flang-new) instead of gfortran for the Fortran parts, you currently need
 to edit the meson.build file and change the line `'fortran_std=legacy'` to `'fortran_std=none'` to work around an incompatibility
 between Meson and flang.

 If you are building and running the benchmark under MS Windows, it may be necessary to copy the generated openblas_wrap module from
 your build folder to the `benchmarks` folder.

 The benchmark syntax is consistent with that of `pytest-benchmark` framework. The incantation to run the suite locally is `$ pytest benchmark/pybench/benchmarks/bench_blas.py`.

 An ASV compatible benchmark suite is planned but currently not implemented.

--- a/+ 3
+++ b/+ 3
@@ -6,6 +6,9 @@ hostarch=`uname -m | sed -e 's/i.86/x86/'`
 if [ "$hostos" = "AIX" ] || [ "$hostos" = "SunOS" ]; then
    hostarch=`uname -p`
 fi
 if [ "$hostarch" = "evbarm" ]; then
    hostarch=`uname -p`
 fi
 case "$hostarch" in
    amd64) hostarch=x86_64 ;;
    arm*) [ "$hostarch" = "arm64" ] || hostarch='arm' ;;
--- a/cblas.h
+++ b/cblas.h
@@ -407,13 +407,13 @@ void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum
 void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a, 
 		     OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); 

 void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta, 
 void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta, 
 		  float *c, OPENBLAS_CONST blasint cldc); 
 void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta, 
 void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta, 
 		  double *c, OPENBLAS_CONST blasint cldc); 
 void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta, 
 void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta, 
 		  float *c, OPENBLAS_CONST blasint cldc); 
 void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, 
 void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, 
 		  double *c, OPENBLAS_CONST blasint cldc); 

 void cblas_sgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -94,6 +94,10 @@ if (DYNAMIC_ARCH)
    endif ()
  endif ()

  if (LOONGARCH64)
    set(DYNAMIC_CORE LA64_GENERIC LA264 LA464)
  endif ()

  if (EXISTS ${PROJECT_SOURCE_DIR}/config_kernel.h)
 	  message (FATAL_ERROR "Your build directory contains a file config_kernel.h, probably from a previous compilation with make. This will conflict with the cmake compilation and cause strange compiler errors - please remove the file before trying again")
  endif ()
--- a/cmake/f_check.cmake
+++ b/cmake/f_check.cmake
@@ -45,13 +45,15 @@ if (NOT ONLY_CBLAS)

  # TODO: detect whether underscore needed, set #defines and BU appropriately - use try_compile
  # TODO: set FEXTRALIB flags a la f_check?

  if (NOT (${CMAKE_SYSTEM_NAME} MATCHES "Windows" AND x${CMAKE_Fortran_COMPILER_ID} MATCHES "IntelLLVM"))
  set(BU "_")
  file(APPEND ${TARGET_CONF_TEMP}
    "#define BUNDERSCORE _\n"
    "#define NEEDBUNDERSCORE 1\n"
    "#define NEED2UNDERSCORES 0\n")

  else ()
 	  set (FCOMMON_OPT "${FCOMMON_OPT} /fp:precise /recursive /names:lowercase /assume:nounderscore")
  endif()
 else ()

  #When we only build CBLAS, we set NOFORTRAN=2
--- a/cmake/fc.cmake
+++ b/cmake/fc.cmake
@@ -61,21 +61,25 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
    endif ()
    if (LOONGARCH64)
      if (BINARY64)
 	CHECK_C_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
        if(COMPILER_SUPPORT_LP64D_ABI)
 	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d")
 	else()
 	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
 	endif ()
        if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
 	  CHECK_C_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
          if(COMPILER_SUPPORT_LP64D_ABI)
 	    set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d")
 	  else()
 	    set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
 	  endif ()
        endif ()
        if (INTERFACE64) 
          set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") 
        endif () 
      else ()
 	CHECK_C_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
 	if(COMPILER_SUPPORT_ILP32D_ABI)
 	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d")
 	else()
 	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
        if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
 	  CHECK_C_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
 	  if(COMPILER_SUPPORT_ILP32D_ABI)
 	    set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d")
 	  else()
 	    set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
          endif ()
 	endif ()
      endif ()
    endif ()
@@ -253,13 +257,40 @@ if (${F_COMPILER} STREQUAL "COMPAQ")
 endif ()

 if (${F_COMPILER} STREQUAL "CRAY")
  set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_INTEL")
 	set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_CRAYFC")
  set(FCOMMON_OPT "${FCOMMON_OPT} -hnopattern")
  if (INTERFACE64)
    set (FCOMMON_OPT "${FCOMMON_OPT} -s integer64")
  endif ()
  if (NOT USE_OPENMP)
    set(FCOMMON_OPT "${FCOMMON_OPT} -O noomp")
    set(FCOMMON_OPT "${FCOMMON_OPT} -fno-openmp")
  else ()
    set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
  endif ()
 endif ()

 if (${F_COMPILER} STREQUAL "NAGFOR")
  set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_NAG")
  if (INTERFACE64)
    set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
  endif ()
  # Options from Makefile.system
  # -dcfuns: Enable non-standard double precision complex intrinsic functions
  # -ieee=full: enables all IEEE arithmetic facilities including non-stop arithmetic.
  # -w=obs: Suppress warning messages about obsolescent features
  # -thread_safe: Compile code for safe execution in a multi-threaded environment.
  # -recursive: Specifies that procedures are RECURSIVE by default.
  set(FCOMMON_OPT "${FCOMMON_OPT} -dcfuns -recursive -ieee=full -w=obs -thread_safe")
  # Options from Reference-LAPACK
  # Suppress compiler banner and summary
  set(FCOMMON_OPT "${FCOMMON_OPT} -quiet")
  # Disable other common warnings
  # -w=x77: Suppress warning messages about Fortran 77 features
  # -w=ques: Suppress warning messages about questionable usage
  # -w=unused: Suppress warning messages about unused variables
  set(FCOMMON_OPT "${FCOMMON_OPT} -w=x77 -w=ques -w=unused")
  if (USE_OPENMP)
    set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
  endif ()
 endif ()

--- a/cmake/lapack.cmake
+++ b/cmake/lapack.cmake
@@ -1018,7 +1018,12 @@ foreach (LA_FILE ${LA_GEN_SRC})
 endforeach ()

 if (NOT C_LAPACK)
  set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}")
  # The below line is duplicating Fortran flags but NAG has a few flags
  # that cannot be specified twice. It's possible this is not needed for
  # any compiler, but for safety, we only turn off for NAG
  if (NOT ${F_COMPILER} STREQUAL "NAGFOR")
    set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}")
  endif ()
  if (${F_COMPILER} STREQUAL "GFORTRAN")
    set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS} -fno-tree-vectorize")
  endif()
--- a/cmake/openblas.pc.in
+++ b/cmake/openblas.pc.in
@@ -9,5 +9,5 @@ Name: OpenBLAS
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
 Version: @OpenBLAS_VERSION@
 URL: https://github.com/OpenMathLib/OpenBLAS
 Libs: @OpenMP_C_FLAGS@ -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix} 
 Cflags: -I${includedir}
 Libs: -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix} 
 Cflags: -I${includedir} @OpenMP_C_FLAGS@ 
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -58,7 +58,7 @@ set(TARGET_CONF_TEMP "${PROJECT_BINARY_DIR}/${TARGET_CONF}.tmp")

 # c_check
 set(FU "")
 if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang"))
 if (APPLE OR (MSVC AND NOT (${CMAKE_C_COMPILER_ID} MATCHES "Clang" OR ${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM")))
  set(FU "_")
 endif()
 if(MINGW AND NOT MINGW64)
@@ -1349,6 +1349,54 @@ endif ()
      "#define DTB_DEFAULT_ENTRIES 128\n"
      "#define DTB_SIZE 4096\n"
      "#define L2_ASSOCIATIVE 4\n")
  elseif ("${TCORE}" STREQUAL "LA64_GENERIC")
    file(APPEND ${TARGET_CONF_TEMP}
      "#define DTB_DEFAULT_ENTRIES 64\n")
      set(SGEMM_UNROLL_M 2)
      set(SGEMM_UNROLL_N 8)
      set(DGEMM_UNROLL_M 2)
      set(DGEMM_UNROLL_N 8)
      set(CGEMM_UNROLL_M 1)
      set(CGEMM_UNROLL_N 4)
      set(ZGEMM_UNROLL_M 1)
      set(ZGEMM_UNROLL_N 4)
      set(CGEMM3M_UNROLL_M 2)
      set(CGEMM3M_UNROLL_N 8)
      set(ZGEMM3M_UNROLL_M 2)
      set(ZGEMM3M_UNROLL_N 8)
  elseif ("${TCORE}" STREQUAL "LA264")
    file(APPEND ${TARGET_CONF_TEMP}
      "#define DTB_DEFAULT_ENTRIES 64\n")
      set(HAVE_LSX  1)
      set(SGEMM_UNROLL_M 2)
      set(SGEMM_UNROLL_N 8)
      set(DGEMM_UNROLL_M 8)
      set(DGEMM_UNROLL_N 4)
      set(CGEMM_UNROLL_M 8)
      set(CGEMM_UNROLL_N 4)
      set(ZGEMM_UNROLL_M 4)
      set(ZGEMM_UNROLL_N 4)
      set(CGEMM3M_UNROLL_M 2)
      set(CGEMM3M_UNROLL_N 8)
      set(ZGEMM3M_UNROLL_M 8)
      set(ZGEMM3M_UNROLL_N 4)
  elseif ("${TCORE}" STREQUAL "LA464")
    file(APPEND ${TARGET_CONF_TEMP}
      "#define DTB_DEFAULT_ENTRIES 64\n")
      set(HAVE_LASX 1)
      set(HAVE_LSX  1)
      set(SGEMM_UNROLL_M 16)
      set(SGEMM_UNROLL_N 8)
      set(DGEMM_UNROLL_M 16)
      set(DGEMM_UNROLL_N 6)
      set(CGEMM_UNROLL_M 16)
      set(CGEMM_UNROLL_N 4)
      set(ZGEMM_UNROLL_M 8)
      set(ZGEMM_UNROLL_N 4)
      set(CGEMM3M_UNROLL_M 16)
      set(CGEMM3M_UNROLL_N 8)
      set(ZGEMM3M_UNROLL_M 16)
      set(ZGEMM3M_UNROLL_N 6)
  endif()
  set(SBGEMM_UNROLL_M 8)
  set(SBGEMM_UNROLL_N 4)
@@ -1385,7 +1433,9 @@ else(NOT CMAKE_CROSSCOMPILING)
    message(STATUS "MSVC")
    set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
  else()
    list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
    if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
      list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
    endif()
    if (DEFINED TARGET_CORE)
    set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_${TARGET_CORE})
  endif ()
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -263,7 +263,7 @@ if (DEFINED TARGET)
  endif()

  if (${TARGET} STREQUAL POWER10)
    if (CMAKE_C_COMPILER VERSION VERSION_GREATER 10.2 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.2)
    if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.2 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.2)
      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math")
    else ()
 	    message(FATAL_ERROR "Compiler GCC ${CMAKE_C_COMPILER_VERSION} does not support Power10.")
@@ -382,13 +382,15 @@ if (NEED_PIC)
  if (NOT NOFORTRAN)
    if (${F_COMPILER} STREQUAL "SUN")
      set(FCOMMON_OPT "${FCOMMON_OPT} -pic")
    elseif (${F_COMPILER} STREQUAL "NAGFOR")
      set(FCOMMON_OPT "${FCOMMON_OPT} -PIC")
    else ()
      set(FCOMMON_OPT "${FCOMMON_OPT} -fPIC")
    endif ()
  endif()
 endif ()

 if (X86_64 OR ${CORE} STREQUAL POWER10)
 if (X86_64 OR ${CORE} STREQUAL POWER10 OR ARM64 OR LOONGARCH64)
  set(SMALL_MATRIX_OPT TRUE)
 endif ()
 if (ARM64)
@@ -398,12 +400,15 @@ endif ()
 if (GEMM_GEMV_FORWARD AND NOT ONLY_CBLAS)
  set(CCOMMON_OPT "${CCOMMON_OPT} -DGEMM_GEMV_FORWARD")
 endif ()
 if (GEMM_GEMV_FORWARD_BF16 AND NOT ONLY_CBLAS)
    set(CCOMMON_OPT "${CCOMMON_OPT} -DGEMM_GEMV_FORWARD_BF16")
 endif ()
 if (SMALL_MATRIX_OPT)
  set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT")
 endif ()

 if (DYNAMIC_ARCH)
  if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64)
  if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64 OR LOONGARCH64)
    set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
    if (DYNAMIC_OLDER)
      set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
@@ -637,17 +642,17 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
 endif ()

 if (CMAKE_Fortran_COMPILER)
 if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
  set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512")
  if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
 message(STATUS "removing fortran flags")
    set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64")
  if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
    set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512")
    if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
      message(STATUS "removing fortran flags")
      set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64")
    endif ()
    foreach (FILTER_FLAG ${FILTER_FLAGS})
      string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS})
      string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS})
    endforeach ()
  endif ()
  foreach (FILTER_FLAG ${FILTER_FLAGS})
    string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS})
    string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS})
  endforeach ()
 endif ()
 endif ()

 if ("${F_COMPILER}" STREQUAL "GFORTRAN")
@@ -667,6 +672,9 @@ endif ()
 if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
  set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE")
 endif ()
 if (${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM" AND ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
 	set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DNOCHANGE")
 endif ()

 if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
 if ("${F_COMPILER}" STREQUAL "FLANG")
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -10,6 +10,10 @@ if (${HOST_OS} STREQUAL "WINDOWS")
  set(HOST_OS WINNT)
 endif ()

 if (${HOST_OS} STREQUAL "IOS")
  set(HOST_OS DARWIN)
 endif ()

 if (${HOST_OS} STREQUAL "LINUX")
 # check if we're building natively on Android (TERMUX)
    EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
@@ -104,6 +108,8 @@ elseif(ARM)
  set(ARCH "arm")
 elseif(ARM64)
  set(ARCH "arm64")
 elseif(LOONGARCH64)
  set(ARCH "loongarch64")
 else()
  set(ARCH ${CMAKE_SYSTEM_PROCESSOR} CACHE STRING "Target Architecture")
 endif ()
--- a/common.h
+++ b/common.h
@@ -372,6 +372,12 @@ typedef int blasint;
 #endif
 #endif

 #if defined(ARCH_RISCV64)
 #ifndef YIELDING
 #define YIELDING        __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
 #endif
 #endif


 #ifdef __EMSCRIPTEN__
 #define YIELDING
--- a/common_arm.h
+++ b/common_arm.h
@@ -47,8 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #endif

 #define INLINE inline

 #define RETURN_BY_COMPLEX

 #ifndef ASSEMBLER
@@ -104,9 +102,16 @@ static inline int blas_quickdivide(blasint x, blasint y){

 #if defined(ASSEMBLER) && !defined(NEEDPARAM)

 #if !defined(__APPLE__) && !defined(_WIN32)
 #define OPENBLAS_ARM_TYPE_FUNCTION .type	REALNAME, %function ;
 #else
 #define OPENBLAS_ARM_TYPE_FUNCTION
 #endif

 #define PROLOGUE \
 	.arm		 ;\
 	.global	REALNAME ;\
 	OPENBLAS_ARM_TYPE_FUNCTION \
 REALNAME:

 #define EPILOGUE
--- a/common_arm64.h
+++ b/common_arm64.h
@@ -44,9 +44,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define RMB  __asm__ __volatile__ ("dmb  ishld" : : : "memory")
 #endif

 #define INLINE inline

 #if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI)
 #if defined( F_INTERFACE_FLANG) || (defined(F_INTERFACE_PGI) && (defined(__NVCOMPILER) && (__NVCOMPILER_MAJOR__ < 23 || (__NVCOMPILER_MAJOR__ == 23 && __NVCOMPILER_MINOR__ < 9))))
 #define RETURN_BY_STACK
 #else
 #define RETURN_BY_COMPLEX
@@ -55,6 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef ASSEMBLER


 #ifndef NO_AFFINITY
 static __inline int WhereAmI(void){
  uint64_t ret;
  __asm__ volatile (
@@ -67,6 +66,7 @@ static __inline int WhereAmI(void){
  if ((int)ret <0) ret = 0;
  return (int)ret;
 }
 #endif

 static __inline void blas_lock(volatile BLASULONG *address){

--- a/common_e2k.h
+++ b/common_e2k.h
@@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
 #define RMB

 #define INLINE __attribute__((__always_inline__)) inline

 static inline int blas_quickdivide(blasint x, blasint y) {
  return x / y;
 }
--- a/common_loongarch64.h
+++ b/common_loongarch64.h
@@ -75,8 +75,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define WMB __sync_synchronize()
 #define RMB __sync_synchronize()

 #define INLINE inline

 #ifndef ASSEMBLER

 static inline int blas_quickdivide(blasint x, blasint y){
@@ -281,9 +279,13 @@ REALNAME: ;\
 #define GNUSTACK
 #endif /* defined(__linux__) && defined(__ELF__) */

 #ifdef __clang__
 #define EPILOGUE .end
 #else
 #define EPILOGUE      \
    .end    REALNAME ;\
    GNUSTACK
 #endif

 #define PROFCODE

--- a/common_mips.h
+++ b/common_mips.h
@@ -37,8 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define WMB __sync_synchronize()
 #define RMB __sync_synchronize()

 #define INLINE inline

 #define RETURN_BY_COMPLEX

 #ifndef ASSEMBLER
--- a/common_mips64.h
+++ b/common_mips64.h
@@ -75,8 +75,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define WMB __sync_synchronize()
 #define RMB __sync_synchronize()

 #define INLINE inline

 #ifndef ASSEMBLER

 static inline unsigned int rpcc(void){
--- a/common_power.h
+++ b/common_power.h
@@ -78,8 +78,6 @@
 #define RMB		__asm__ __volatile__ ("sync")
 #endif

 #define INLINE inline

 #ifdef PPC440
 #define STDERR stdout
 #define QNONCACHE 0x1
@@ -91,7 +89,7 @@

 void *qalloc(int flags, size_t bytes);

 static INLINE void blas_lock(volatile unsigned long *address){
 static inline void blas_lock(volatile unsigned long *address){

  long int ret, val = 1;

--- a/common_riscv64.h
+++ b/common_riscv64.h
@@ -75,8 +75,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define WMB __sync_synchronize()
 #define RMB __sync_synchronize()

 #define INLINE inline

 #ifndef ASSEMBLER


--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -283,6 +283,10 @@ static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){
 #define RETURN_BY_STACK
 #endif

 #ifdef F_INTERFACE_CRAYFC
 #define RETURN_BY_PACKED
 #endif

 #ifdef F_INTERFACE_FUJITSU
 #define RETURN_BY_STACK
 #endif
--- a/common_zarch.h
+++ b/common_zarch.h
@@ -37,9 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define WMB  
 #define RMB


 #define INLINE inline

 #define RETURN_BY_COMPLEX

 #ifndef ASSEMBLER
--- a/cpuid_arm64.c
+++ b/cpuid_arm64.c
@@ -25,6 +25,7 @@
  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *****************************************************************************/

 #include <stdlib.h>
 #include <string.h>
 #ifdef __APPLE__
 #include <sys/sysctl.h>
@@ -33,6 +34,20 @@ size_t length=sizeof(value);
 int64_t value64;
 size_t length64=sizeof(value64);
 #endif
 #if (defined OS_LINUX || defined OS_ANDROID)
 #include <asm/hwcap.h>
 #include <sys/auxv.h>
 #ifndef HWCAP_CPUID
 #define HWCAP_CPUID (1 << 11)
 #endif
 #ifndef HWCAP_SVE
 #define HWCAP_SVE (1 << 22)
 #endif

 #define get_cpu_ftr(id, var) ({                                 \
                __asm__ __volatile__ ("mrs %0, "#id : "=r" (var));              \
        })
 #endif

 #define CPU_UNKNOWN     	0
 #define CPU_ARMV8       	1
@@ -42,11 +57,11 @@ size_t length64=sizeof(value64);
 #define CPU_CORTEXA57     3
 #define CPU_CORTEXA72     4
 #define CPU_CORTEXA73     5
 #define CPU_CORTEXA76	  23
 #define CPU_CORTEXA76    23
 #define CPU_NEOVERSEN1    11
 #define CPU_NEOVERSEV1    16
 #define CPU_NEOVERSEN2    17
 #define CPU_NEOVERSEV2   24
 #define CPU_NEOVERSEV2    24
 #define CPU_CORTEXX1      18
 #define CPU_CORTEXX2	  19
 #define CPU_CORTEXA510	  20
@@ -93,7 +108,7 @@ static char *cpuname[] = {
  "CORTEXA710",
  "FT2000",
  "CORTEXA76",
 	"NEOVERSEV2"
  "NEOVERSEV2"
 };

 static char *cpuname_lower[] = {
@@ -121,13 +136,17 @@ static char *cpuname_lower[] = {
  "cortexa710",
  "ft2000",
  "cortexa76",
 	"neoversev2"
  "neoversev2"
 };

 static int cpulowperf=0;
 static int cpumidperf=0;
 static int cpuhiperf=0;

 int get_feature(char *search)
 {

 #ifdef __linux
 #if defined( __linux ) || defined( __NetBSD__ )
 	FILE *infile;
  	char buffer[2048], *p,*t;
  	p = (char *) NULL ;
@@ -158,33 +177,108 @@ int get_feature(char *search)
 #endif
 	return(0);
 }

 static int cpusort(const void *model1, const void *model2)
 {
 	return (*(int*)model2-*(int*)model1);
 }

 int detect(void)
 {

 #ifdef __linux

 #if defined( __linux ) || defined( __NetBSD__ )
 	int n,i,ii;
 	int midr_el1;
 	int implementer;
 	int cpucap[1024];
 	int cpucores[1024];
 	FILE *infile;
 	char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL;
 	char cpupart[6],cpuimpl[6];
 	char *cpu_impl=NULL,*cpu_pt=NULL;
 	char buffer[2048], *p, *cpu_part = NULL, *cpu_implementer = NULL;
 	p = (char *) NULL ;

 	infile = fopen("/proc/cpuinfo", "r");
 	while (fgets(buffer, sizeof(buffer), infile)) {
 		if ((cpu_part != NULL) && (cpu_implementer != NULL)) {
 			break;
 	cpulowperf=cpumidperf=cpuhiperf=0;
 	for (i=0;i<1024;i++)cpucores[i]=0;
 	n=0;
 	infile = fopen("/sys/devices/system/cpu/possible", "r");
 	if (!infile) {
 		infile = fopen("/proc/cpuinfo", "r");
 		while (fgets(buffer, sizeof(buffer), infile)) {
 			if (!strncmp("processor", buffer, 9))
 			n++;
 		}

 		if ((cpu_part == NULL) && !strncmp("CPU part", buffer, 8)) {
 			cpu_part = strchr(buffer, ':') + 2;
 			cpu_part = strdup(cpu_part);
 		} else if ((cpu_implementer == NULL) && !strncmp("CPU implementer", buffer, 15)) {
 			cpu_implementer = strchr(buffer, ':') + 2;
 			cpu_implementer = strdup(cpu_implementer);
 	} else {
 		fgets(buffer, sizeof(buffer), infile);
 		sscanf(buffer,"0-%d",&n);
 		n++;
 	}
        fclose(infile);

 	cpu_implementer=NULL;
 	for (i=0;i<n;i++){
 		sprintf(buffer,"/sys/devices/system/cpu/cpu%d/regs/identification/midr_el1",i);
 		infile= fopen(buffer,"r");
 		if (!infile) {
 			infile = fopen("/proc/cpuinfo", "r");
 			for (ii=0;ii<n;ii++){
 				cpu_part=NULL;cpu_implementer=NULL;
 				while (fgets(buffer, sizeof(buffer), infile)) {
 					if ((cpu_part != NULL) && (cpu_implementer != NULL)) {
 						break;
 					}

 					if ((cpu_part == NULL) && !strncmp("CPU part", buffer, 8)) {
 					cpu_pt = strchr(buffer, ':') + 2;
 					cpu_part = strdup(cpu_pt);
 					cpucores[i]=strtol(cpu_part,NULL,0);

 					} else if ((cpu_implementer == NULL) && !strncmp("CPU implementer", buffer, 15)) {
 					cpu_impl = strchr(buffer, ':') + 2;
 					cpu_implementer = strdup(cpu_impl);
 					}

 				}
 				if (strstr(cpu_implementer, "0x41")) {
 					if (cpucores[ii] >= 0xd4b) cpuhiperf++;
 				else
 					if (cpucores[ii] >= 0xd07) cpumidperf++;
 				else cpulowperf++;
 				}
 				else cpulowperf++;
 			}
 			fclose(infile);
 			break;
 		} else {
 		        (void)fgets(buffer, sizeof(buffer), infile);
 			midr_el1=strtoul(buffer,NULL,16);
 			fclose(infile);
 			implementer = (midr_el1 >> 24) & 0xFF;
 			cpucores[i] = (midr_el1 >> 4)  & 0xFFF;
 		sprintf(buffer,"/sys/devices/system/cpu/cpu%d/cpu_capacity",i);
 		infile= fopen(buffer,"r");
 		if (!infile) {
 				if (implementer== 65) {
 					if (cpucores[i] >= 0xd4b) cpuhiperf++;
 				else
 					if (cpucores[i] >= 0xd07) cpumidperf++;
 				else cpulowperf++;
 				}
 				else cpulowperf++;
 			} else {
 		        (void)fgets(buffer, sizeof(buffer), infile);
 				sscanf(buffer,"%d",&cpucap[i]);
 					if (cpucap[i] >= 1000) cpuhiperf++;
 				else
 					if (cpucap[i] >= 500) cpumidperf++;
 				else cpulowperf++;
        		fclose(infile);
 			}
 		}
 		sprintf(cpuimpl,"0x%2x",implementer);
 		cpu_implementer=strdup(cpuimpl);
 	}

 	fclose(infile);
 	qsort(cpucores,1024,sizeof(int),cpusort);
 	sprintf(cpupart,"0x%3x",cpucores[0]);
 	cpu_part=strdup(cpupart);
 	if(cpu_part != NULL && cpu_implementer != NULL) {
    // Arm
    if (strstr(cpu_implementer, "0x41")) {
@@ -219,7 +313,7 @@ int detect(void)
      else if (strstr(cpu_part, "0xd4f")) //NVIDIA Grace et al.
        return CPU_NEOVERSEV2;
      else if (strstr(cpu_part, "0xd0b")) 
 	return CPU_CORTEXA76;
       return CPU_CORTEXA76;
    }
    // Qualcomm
    else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
@@ -277,10 +371,20 @@ int detect(void)
 	}
 #else
 #ifdef __APPLE__
 	sysctlbyname("hw.ncpu",&value64,&length64,NULL,0);
 	cpulowperf=value64;
 	sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0);
 	if (value64 > 1) {
 	sysctlbyname("hw.perflevel0.cpusperl",&value64,&length64,NULL,0);
 	cpuhiperf=value64;
 	sysctlbyname("hw.perflevel1.cpusperl",&value64,&length64,NULL,0);
 	cpulowperf=value64;
 	}
 	sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
 	if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
 	if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
 	if (value64 == 2271604202) return CPU_VORTEX; //A16/M3
        if (value64 == 2271604202) return CPU_VORTEX; //A16/M3
        if (value64 == 1867590060) return CPU_VORTEX; //M4
 #endif
 	return CPU_ARMV8;	
 #endif
@@ -313,7 +417,7 @@ void get_cpucount(void)
 {
 int n=0;

 #ifdef __linux
 #if defined( __linux ) || defined( __NetBSD__ )
 	FILE *infile;
  	char buffer[2048], *p,*t;
  	p = (char *) NULL ;
@@ -330,10 +434,22 @@ int n=0;
  	fclose(infile);

 	printf("#define NUM_CORES %d\n",n);
 	if (cpulowperf >0)
 	printf("#define NUM_CORES_LP %d\n",cpulowperf);
 	if (cpumidperf >0)
 	printf("#define NUM_CORES_MP %d\n",cpumidperf);
 	if (cpuhiperf >0)
 	printf("#define NUM_CORES_HP %d\n",cpuhiperf);
 #endif
 #ifdef __APPLE__
 	sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0);
 	printf("#define NUM_CORES %d\n",value);
 	if (cpulowperf >0)
 	printf("#define NUM_CORES_LP %d\n",cpulowperf);
 	if (cpumidperf >0)
 	printf("#define NUM_CORES_MP %d\n",cpumidperf);
 	if (cpuhiperf >0)
 	printf("#define NUM_CORES_HP %d\n",cpuhiperf);
 #endif	
 }

@@ -346,7 +462,6 @@ void get_cpuconfig(void)
  printf("#define ARMV8\n");
  printf("#define HAVE_NEON\n"); // This shouldn't be necessary
  printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary

 	int d = detect();
 	switch (d)
 	{
@@ -401,7 +516,8 @@ void get_cpuconfig(void)
 		break;

 	    case CPU_NEOVERSEV1:
 	    case CPU_CORTEXA76:
                printf("#define HAVE_SVE 1\n");
            case CPU_CORTEXA76:
                printf("#define %s\n", cpuname[d]);
                printf("#define L1_CODE_SIZE 65536\n");
                printf("#define L1_CODE_LINESIZE 64\n");
@@ -429,29 +545,32 @@ void get_cpuconfig(void)
                printf("#define L2_ASSOCIATIVE 8\n");
                printf("#define DTB_DEFAULT_ENTRIES 48\n");
                printf("#define DTB_SIZE 4096\n");
                printf("#define HAVE_SVE 1\n");
                break;
      case CPU_NEOVERSEV2:
       case CPU_NEOVERSEV2:
                printf("#define ARMV9\n");
                printf("#define %s\n", cpuname[d]);
                printf("#define L1_CODE_SIZE 65536\n");
                printf("#define L1_CODE_LINESIZE 64\n");
                printf("#define L1_CODE_ASSOCIATIVE 4\n");
                printf("#define L1_DATA_SIZE 65536\n");
                printf("#define L1_DATA_LINESIZE 64\n");
                printf("#define L1_DATA_ASSOCIATIVE 4\n");
                printf("#define L2_SIZE 1048576\n");
                printf("#define L2_LINESIZE 64\n");
                printf("#define L2_ASSOCIATIVE 8\n");
 								// L1 Data TLB = 48 entries
 								// L2 Data TLB = 2048 entries
                printf("#define DTB_DEFAULT_ENTRIES 48\n");
                printf("#define DTB_SIZE 4096\n");  // Set to 4096 for symmetry with other configs.
                break;
                printf("#define HAVE_SVE 1\n");
                 printf("#define %s\n", cpuname[d]);
                 printf("#define L1_CODE_SIZE 65536\n");
                 printf("#define L1_CODE_LINESIZE 64\n");
                 printf("#define L1_CODE_ASSOCIATIVE 4\n");
                 printf("#define L1_DATA_SIZE 65536\n");
                 printf("#define L1_DATA_LINESIZE 64\n");
                 printf("#define L1_DATA_ASSOCIATIVE 4\n");
                 printf("#define L2_SIZE 1048576\n");
                 printf("#define L2_LINESIZE 64\n");
                 printf("#define L2_ASSOCIATIVE 8\n");
                                                                // L1 Data TLB = 48 entries
                                                                // L2 Data TLB = 2048 entries
                 printf("#define DTB_DEFAULT_ENTRIES 48\n");
                 printf("#define DTB_SIZE 4096\n");  // Set to 4096 for symmetry with other configs.
 		break;
 	    case CPU_CORTEXA510:
 	    case CPU_CORTEXA710:
 	    case CPU_CORTEXX1:
 	    case CPU_CORTEXX2:
 		printf("#define ARMV9\n");
                printf("#define HAVE_SVE 1\n");
                printf("#define %s\n", cpuname[d]);
                printf("#define L1_CODE_SIZE 65536\n");
                printf("#define L1_CODE_LINESIZE 64\n");
@@ -568,6 +687,7 @@ void get_cpuconfig(void)
 		break;
 	    case CPU_A64FX:
 		printf("#define A64FX\n");
                printf("#define HAVE_SVE 1\n");
    		printf("#define L1_CODE_SIZE 65535\n");
    		printf("#define L1_DATA_SIZE 65535\n");
    		printf("#define L1_DATA_LINESIZE 256\n");
@@ -600,7 +720,7 @@ void get_libname(void)
 void get_features(void)
 {

 #ifdef __linux
 #if defined( __linux ) || defined( __NetBSD__ )
 	FILE *infile;
  	char buffer[2048], *p,*t;
  	p = (char *) NULL ;
--- a/cpuid_loongarch64.c
+++ b/cpuid_loongarch64.c
@@ -1,5 +1,5 @@
 /*****************************************************************************
 Copyright (c) 2011-2020, The OpenBLAS Project
 Copyright (c) 2011-2024, The OpenBLAS Project
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
@@ -32,53 +32,299 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/

 #include <stdint.h>
 #include <sys/auxv.h>
 #include <stdio.h>
 #include <math.h>
 #include <string.h>
 #include <sys/auxv.h>

 /*  If LASX extension instructions supported,
 *  using core LOONGSON3R5
 *  If only LSX extension instructions supported,
 *  using core LOONGSON2K1000
 *  If neither LASX nor LSX extension instructions supported,
 *  using core LOONGSONGENERIC (As far as I know, there is no such
 *  CPU yet)
 */
 #define CPU_LA64_GENERIC     0
 #define CPU_LA264            1
 #define CPU_LA364            2
 #define CPU_LA464            3
 #define CPU_LA664            4

 #define CPU_GENERIC        0
 #define CPU_LOONGSON3R5    1
 #define CPU_LOONGSON2K1000 2
 #define CORE_LA64_GENERIC    0
 #define CORE_LA264           1
 #define CORE_LA464           2

 #define LA_HWCAP_LSX    (1U << 4)
 #define LA_HWCAP_LASX   (1U << 5)

 #define LOONGARCH_CFG0      0x00
 #define LOONGARCH_CFG2      0x02
 #define LOONGARCH_CFG10     0x10
 #define LOONGARCH_CFG11     0x11
 #define LOONGARCH_CFG12     0x12
 #define LOONGARCH_CFG13     0x13
 #define LOONGARCH_CFG14     0x14
 #define LASX_MASK           1<<7
 #define LSX_MASK            1<<6
 #define PRID_SERIES_MASK    0xf000
 #define PRID_SERIES_LA264   0xa000
 #define PRID_SERIES_LA364   0xb000
 #define PRID_SERIES_LA464   0xc000
 #define PRID_SERIES_LA664   0xd000

 #define CACHE_INFO_L1_IU    0
 #define CACHE_INFO_L1_D     1
 #define CACHE_INFO_L2_IU    2
 #define CACHE_INFO_L2_D     3
 #define CACHE_INFO_L3_IU    4
 #define CACHE_INFO_L3_D     5
 #define L1_IU_PRESENT_MASK  0x0001
 #define L1_IU_UNITY_MASK    0x0002
 #define L1_D_PRESENT_MASK   0x0004
 #define L2_IU_PRESENT_MASK  0x0008
 #define L2_IU_UNITY_MASK    0x0010
 #define L2_D_PRESENT_MASK   0x0080
 #define L3_IU_PRESENT_MASK  0x0400
 #define L3_IU_UNITY_MASK    0x0800
 #define L3_D_PRESENT_MASK   0x4000
 #define CACHE_WAY_MINUS_1_MASK      0x0000ffff
 #define CACHE_INDEX_LOG2_MASK       0x00ff0000
 #define CACHE_LINESIZE_LOG2_MASK    0x7f000000

 typedef struct {
  int size;
  int associative;
  int linesize;
  int unify;
  int present;
 } cache_info_t;

 /* Using microarchitecture representation */
 static char *cpuname[] = {
  "LOONGSONGENERIC",
  "LOONGSON3R5",
  "LOONGSON2K1000"
  "LA64_GENERIC",
  "LA264", /* Loongson 64bit, 2-issue, Like 2K1000LA */
  "LA364", /* Loongson 64bit, 3-issue, Like 2K2000 */
  "LA464", /* Loongson 64bit, 4-issue, Like 3A5000, 3C5000L, 3C5000 and 3D5000 */
  "LA664"  /* Loongson 64bit, 6-issue, Like 3A6000, 3C6000 and 3D6000 */
 };

 static char *cpuname_lower[] = {
  "loongsongeneric",
  "loongson3r5",
  "loongson2k1000"
  "la64_generic",
  "la264",
  "la364",
  "la464",
  "la664"
 };

 static char *corename[] = {
  "LA64_GENERIC", /* Implies using scalar instructions for optimization */
  "LA264", /* Implies using LSX  instructions for optimization */
  "LA464", /* Implies using LASX instructions for optimization */
 };

 static char *corename_lower[] = {
  "la64_generic",
  "la264",
  "la464",
 };

 int detect(void) {
 #ifdef __linux
 /*
 * Obtain cache and processor identification
 * through the cpucfg command.
 */
 static void get_cacheinfo(int type, cache_info_t *cacheinfo) {
  cache_info_t cache_info;
  memset(&cache_info, 0, sizeof(cache_info));
  uint32_t reg_10 = 0;
  __asm__ volatile (
    "cpucfg %0, %1 \n\t"
    : "+&r"(reg_10)
    : "r"(LOONGARCH_CFG10)
  );

  switch (type) {
    case CACHE_INFO_L1_IU:
      if (reg_10 & L1_IU_PRESENT_MASK) {
        uint32_t reg_11 = 0;
        cache_info.present = reg_10 & L1_IU_PRESENT_MASK;
        cache_info.unify   = reg_10 & L1_IU_UNITY_MASK;
        __asm__ volatile (
          "cpucfg %0, %1 \n\t"
          : "+&r"(reg_11)
          : "r"(LOONGARCH_CFG11)
        );
        cache_info.associative  = (reg_11 & CACHE_WAY_MINUS_1_MASK) + 1;
        cache_info.linesize = 1 << ((reg_11 & CACHE_LINESIZE_LOG2_MASK) >> 24);
        cache_info.size = cache_info.associative * cache_info.linesize *
                          (1 << ((reg_11 & CACHE_INDEX_LOG2_MASK) >> 16));
      }
    break;

    case CACHE_INFO_L1_D:
      if (reg_10 & L1_D_PRESENT_MASK) {
        uint32_t reg_12 = 0;
        cache_info.present = reg_10 & L1_D_PRESENT_MASK;
        __asm__ volatile (
          "cpucfg %0, %1 \n\t"
          : "+&r"(reg_12)
          : "r"(LOONGARCH_CFG12)
        );
        cache_info.associative  = (reg_12 & CACHE_WAY_MINUS_1_MASK) + 1;
        cache_info.linesize = 1 << ((reg_12 & CACHE_LINESIZE_LOG2_MASK) >> 24);
        cache_info.size = cache_info.associative * cache_info.linesize *
                          (1 << ((reg_12 & CACHE_INDEX_LOG2_MASK) >> 16));
      }
    break;

    case CACHE_INFO_L2_IU:
      if (reg_10 & L2_IU_PRESENT_MASK) {
        uint32_t reg_13 = 0;
        cache_info.present = reg_10 & L2_IU_PRESENT_MASK;
        cache_info.unify   = reg_10 & L2_IU_UNITY_MASK;
        __asm__ volatile (
          "cpucfg %0, %1 \n\t"
          : "+&r"(reg_13)
          : "r"(LOONGARCH_CFG13)
        );
        cache_info.associative  = (reg_13 & CACHE_WAY_MINUS_1_MASK) + 1;
        cache_info.linesize = 1 << ((reg_13 & CACHE_LINESIZE_LOG2_MASK) >> 24);
        cache_info.size = cache_info.associative * cache_info.linesize *
                          (1 << ((reg_13 & CACHE_INDEX_LOG2_MASK) >> 16));
      }
    break;

    case CACHE_INFO_L2_D:
      if (reg_10 & L2_D_PRESENT_MASK) {
        cache_info.present = reg_10 & L2_D_PRESENT_MASK;
        // No date fetch
      }
    break;

    case CACHE_INFO_L3_IU:
      if (reg_10 & L3_IU_PRESENT_MASK) {
        uint32_t reg_14 = 0;
        cache_info.present = reg_10 & L3_IU_PRESENT_MASK;
        cache_info.unify   = reg_10 & L3_IU_UNITY_MASK;
        __asm__ volatile (
          "cpucfg %0, %1 \n\t"
          : "+&r"(reg_14)
          : "r"(LOONGARCH_CFG14)
        );
        cache_info.associative  = (reg_14 & CACHE_WAY_MINUS_1_MASK) + 1;
        cache_info.linesize = 1 << ((reg_14 & CACHE_LINESIZE_LOG2_MASK) >> 24);
        cache_info.size = cache_info.associative * cache_info.linesize *
                          (1 << ((reg_14 & CACHE_INDEX_LOG2_MASK) >> 16));
      }
    break;

    case CACHE_INFO_L3_D:
      if (reg_10 & L3_D_PRESENT_MASK) {
        cache_info.present = reg_10 & L3_D_PRESENT_MASK;
        // No data fetch
      }
    break;

    default:
    break;
  }
  *cacheinfo = cache_info;
 }

 static uint32_t get_prid() {
  uint32_t reg = 0;
  __asm__ volatile (
    "cpucfg %0, %1 \n\t"
    : "+&r"(reg)
    : "r"(LOONGARCH_CFG0)
  );
  return reg;
 }

 static void get_cpucount(uint32_t *count) {
  uint32_t num = 0;
  FILE *f = fopen("/proc/cpuinfo", "r");
  if (!f) return;
  char buf[200];
  while (fgets(buf, sizeof(buf), f))
  {
    if (!strncmp("processor", buf, 9))
      num ++;
  }
  fclose(f);
  *count = num;
 }

 /* Detect whether the OS supports the LASX instruction set */
 static int os_support_lasx() {
  int hwcap  = (int)getauxval(AT_HWCAP);

  if (hwcap & LA_HWCAP_LASX)
    return CPU_LOONGSON3R5;
  else if (hwcap & LA_HWCAP_LSX)
    return CPU_LOONGSON2K1000;
    return 1;
  else
    return 0;
 }

 /* Detect whether the OS supports the LSX instruction set */
 static int os_support_lsx() {
  int hwcap  = (int)getauxval(AT_HWCAP);

  if (hwcap & LA_HWCAP_LSX)
    return 1;
  else
    return CPU_GENERIC;
 #endif
  return CPU_GENERIC;
    return 0;
 }

 int get_coretype(void) {
  uint32_t prid = get_prid();
  switch (prid & PRID_SERIES_MASK) {
    case (PRID_SERIES_LA464):
    case (PRID_SERIES_LA664):
      if (os_support_lasx())
        return CORE_LA464;
      else if (os_support_lsx())
        return CORE_LA264;
      else
        return CORE_LA64_GENERIC;
    break;

    case (PRID_SERIES_LA264):
    case (PRID_SERIES_LA364):
      if (os_support_lsx())
        return CORE_LA264;
      else
        return CORE_LA64_GENERIC;
    break;

    default:
      return CORE_LA64_GENERIC;
    break;
  }
 }

 int get_cputype(void) {
  uint32_t prid = get_prid();
  switch (prid & PRID_SERIES_MASK) {
    case (PRID_SERIES_LA264):
      return CPU_LA264;
    break;

    case (PRID_SERIES_LA364):
      return CPU_LA364;
    break;

    case (PRID_SERIES_LA464):
      return CPU_LA464;
    break;

    case (PRID_SERIES_LA664):
      return CPU_LA664;
    break;

    default:
      return CPU_LA64_GENERIC;
    break;
  }
 }

 char *get_corename(void) {
  return cpuname[detect()];
  return corename[get_coretype()];
 }

 void get_libname(void){
  printf("%s", corename_lower[get_coretype()]);
 }

 void get_architecture(void) {
@@ -86,8 +332,7 @@ void get_architecture(void) {
 }

 void get_subarchitecture(void) {
  int d = detect();
  printf("%s", cpuname[d]);
  printf("%s", cpuname[get_cputype()]);
 }

 void get_subdirname(void) {
@@ -95,50 +340,69 @@ void get_subdirname(void) {
 }

 void get_cpuconfig(void) {
  uint32_t hwcaps = 0;
  int d = detect();

  switch (d) {
    case CPU_LOONGSON3R5:
      printf("#define LOONGSON3R5\n");
      printf("#define L1_DATA_SIZE 65536\n");
      printf("#define L1_DATA_LINESIZE 64\n");
      printf("#define L2_SIZE 1048576\n");
      printf("#define L2_LINESIZE 64\n");
      printf("#define DTB_DEFAULT_ENTRIES 64\n");
      printf("#define DTB_SIZE 4096\n");
      printf("#define L2_ASSOCIATIVE 16\n");
    break;
  cache_info_t info;
  uint32_t num_cores = 0;

    case CPU_LOONGSON2K1000:
      printf("#define LOONGSON2K1000\n");
      printf("#define L1_DATA_SIZE 65536\n");
      printf("#define L1_DATA_LINESIZE 64\n");
      printf("#define L2_SIZE 262144\n");
      printf("#define L2_LINESIZE 64\n");
      printf("#define DTB_DEFAULT_ENTRIES 64\n");
      printf("#define DTB_SIZE 4096\n");
      printf("#define L2_ASSOCIATIVE 16\n");
    break;
  printf("#define %s\n", corename[get_coretype()]); // Core name

    default:
      printf("#define LOONGSONGENERIC\n");
      printf("#define L1_DATA_SIZE 65536\n");
      printf("#define L1_DATA_LINESIZE 64\n");
      printf("#define L2_SIZE 262144\n");
      printf("#define L2_LINESIZE 64\n");
      printf("#define DTB_DEFAULT_ENTRIES 64\n");
      printf("#define DTB_SIZE 4096\n");
      printf("#define L2_ASSOCIATIVE 16\n");
    break;
  printf("#define CPU_NAME %s\n", cpuname[get_cputype()]); // Cpu microarchitecture name

  get_cacheinfo(CACHE_INFO_L1_IU, &info);
  if (info.present) {
    if (info.unify) { // Unified cache, without distinguishing between instructions and data
      printf("#define L1_SIZE %d\n", info.size);
      printf("#define L1_ASSOCIATIVE %d\n", info.associative);
      printf("#define L1_LINESIZE %d\n", info.linesize);
    } else {
      printf("#define L1_CODE_SIZE %d\n", info.size);
      printf("#define L1_CODE_ASSOCIATIVE %d\n", info.associative);
      printf("#define L1_CODE_LINESIZE %d\n", info.linesize);
    }
  }

  hwcaps = (uint32_t)getauxval( AT_HWCAP );
  if (hwcaps & LA_HWCAP_LSX)      printf("#define HAVE_LSX\n");
  if (hwcaps & LA_HWCAP_LASX)     printf("#define HAVE_LASX\n");
 }
  if (!info.unify) {
    get_cacheinfo(CACHE_INFO_L1_D, &info);
    if (info.present) {
      printf("#define L1_DATA_SIZE %d\n", info.size);
      printf("#define L1_DATA_ASSOCIATIVE %d\n", info.associative);
      printf("#define L1_DATA_LINESIZE %d\n", info.linesize);
    }
  }

 void get_libname(void){
  int d = detect();
  printf("%s", cpuname_lower[d]);
  get_cacheinfo(CACHE_INFO_L2_IU, &info);
  if (info.present > 0) {
    if (info.unify) {
      printf("#define L2_SIZE %d\n", info.size);
      printf("#define L2_ASSOCIATIVE %d\n", info.associative);
      printf("#define L2_LINESIZE %d\n", info.linesize);
    } else {
      printf("#define L2_CODE_SIZE %d\n", info.size);
      printf("#define L2_CODE_ASSOCIATIVE %d\n", info.associative);
      printf("#define L2_CODE_LINESIZE %d\n", info.linesize);
    }
  }

  get_cacheinfo(CACHE_INFO_L3_IU, &info);
  if (info.present > 0) {
    if (info.unify) {
      printf("#define L3_SIZE %d\n", info.size);
      printf("#define L3_ASSOCIATIVE %d\n", info.associative);
      printf("#define L3_LINESIZE %d\n", info.linesize);
    } else {
      printf("#define L3_CODE_SIZE %d\n", info.size);
      printf("#define L3_CODE_ASSOCIATIVE %d\n", info.associative);
      printf("#define L3_CODE_LINESIZE %d\n", info.linesize);
    }
  }

  if(os_support_lsx)  printf("#define HAVE_LSX\n");
  if(os_support_lasx) printf("#define HAVE_LASX\n");

  get_cpucount(&num_cores);
  if (num_cores)
    printf("#define NUM_CORES %d\n", num_cores);

  //TODO: It’s unclear what this entry represents, but it is indeed necessary.
  //It has been set based on reference to other platforms.
  printf("#define DTB_DEFAULT_ENTRIES 64\n");
 }
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1527,6 +1527,19 @@ int get_cpuname(void){
      break;
      case 10: //family 6 exmodel 10
        switch (model) {
 	  case 13: // Granite Rapids
 	  if(support_amx_bf16())
 	    return CPUTYPE_SAPPHIRERAPIDS;
 	  if(support_avx512_bf16())
            return CPUTYPE_COOPERLAKE;	
          if(support_avx512())
            return CPUTYPE_SKYLAKEX;
          if(support_avx2())
            return CPUTYPE_HASWELL;
          if(support_avx())
            return CPUTYPE_SANDYBRIDGE;
          else
          return CPUTYPE_NEHALEM;
          case 5: // Comet Lake H and S
          case 6: // Comet Lake U
 	  case 10: // Meteor Lake
@@ -1676,6 +1689,7 @@ int get_cpuname(void){
 	    return CPUTYPE_BARCELONA;
        }
      case 10: // Zen3/4
      case 11: // Zen5
 #ifndef NO_AVX512
          if(support_avx512_bf16())
            return CPUTYPE_COOPERLAKE;
@@ -2352,8 +2366,22 @@ int get_coretype(void){

      case 10:
        switch (model) {
 	  case 13: // Granite Rapids
 	  if(support_amx_bf16())
 	    return CORE_SAPPHIRERAPIDS;
 	  if(support_avx512_bf16())
            return CORE_COOPERLAKE;	
          if(support_avx512())
            return CORE_SKYLAKEX;
          if(support_avx2())
            return CORE_HASWELL;
          if(support_avx())
 	    return CORE_SANDYBRIDGE;
 	  else
 	    return CORE_NEHALEM;
 	  case 5: // Comet Lake H and S
    	  case 6: // Comet Lake U
 	  case 10: // Meteor Lake
            if(support_avx())
  #ifndef NO_AVX2
              return CORE_HASWELL;
@@ -2362,6 +2390,7 @@ int get_coretype(void){
  #endif
            else
              return CORE_NEHALEM;
 	  case 0: // Meteor Lake
 	  case 7:// Rocket Lake
 #ifndef NO_AVX512
 	  if(support_avx512())
@@ -2451,7 +2480,7 @@ int get_coretype(void){
 	  }
 	  break;
 	}
      } else if (exfamily == 8 || exfamily == 10) {
      } else if (exfamily == 8 || exfamily == 10 || exfamily == 11) {
 	switch (model) {
 	case 1:
 	  // AMD Ryzen
--- a/ctest/CMakeLists.txt
+++ b/ctest/CMakeLists.txt
@@ -6,6 +6,10 @@ enable_language(Fortran)
 endif()

 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS")
 if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_EQUAL 14.2)
       list(REMOVE_ITEM ${CMAKE_Fortran_FLAGS} -O3 -O2 -O1 -Os)
       set (CMAKE_Fortran_FLAGS_RELEASE "" CACHE STRING "" FORCE)
 endif()
 if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
 	set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize")
 endif()
--- a/ctest/Makefile
+++ b/ctest/Makefile
@@ -26,7 +26,7 @@ endif
 override CFLAGS += -DADD$(BU) -DCBLAS
 ifeq ($(F_COMPILER),GFORTRAN)
 ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
 	override FFLAGS = $(filter_out(-O2 -O3,$(FFLAGS))) -O0
 	override FFLAGS := $(filter_out(-O2 -O3,$(FFLAGS))) -O0
 endif
 	override FFLAGS += -fno-tree-vectorize
 endif
--- a/ctest/c_cblat1.f
+++ b/ctest/c_cblat1.f
@@ -38,9 +38,12 @@
            CALL CHECK1(SFAC)
         END IF
 *        -- Print
         IF (PASS) WRITE (NOUT,99998)
         IF (PASS) THEN
            WRITE (NOUT,99998)
         ELSE
            ERROR STOP
        END IF
   20 CONTINUE
      STOP
 *
 99999 FORMAT (' Complex CBLAS Test Program Results',/1X)
 99998 FORMAT ('                                    ----- PASS -----')
@@ -228,7 +231,7 @@
               CALL ITEST1(ICAMAXTEST(N,CX,INCX),ITRUE3(NP1))
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
               STOP
               ERROR STOP
            END IF
 *
   40    CONTINUE
@@ -512,7 +515,7 @@
               CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0)
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
               STOP
               ERROR STOP
            END IF
 *
   40    CONTINUE
--- a/ctest/c_cblat2.f
+++ b/ctest/c_cblat2.f
@@ -10,7 +10,7 @@
 *  'CBLAT2.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
 *  F        LOGICAL FLAG, T TO STOP ON FAILURES.
 *  F        LOGICAL FLAG, T TO ERROR STOP ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@@ -243,7 +243,7 @@
     $      GO TO 70
   60 CONTINUE
      WRITE( NOUT, FMT = 9986 )SNAMET
      STOP
      ERROR STOP
   70 LTEST( I ) = LTESTT
      GO TO 50
 *
@@ -283,7 +283,7 @@
      SAME = LCE( YY, YT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
         STOP
         ERROR STOP
      END IF
      TRANS = 'T'
      CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
@@ -291,7 +291,7 @@
      SAME = LCE( YY, YT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
         STOP
         ERROR STOP
      END IF
 *
 *     Test each subroutine in turn.
@@ -418,7 +418,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
      STOP
      IF( FATAL ) THEN
         ERROR STOP
      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_cblat3.f
+++ b/ctest/c_cblat3.f
@@ -10,7 +10,7 @@
 *  'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
 *  F        LOGICAL FLAG, T TO STOP ON FAILURES.
 *  F        LOGICAL FLAG, T TO ERROR STOP ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@@ -194,7 +194,7 @@
     $      GO TO 50
   40 CONTINUE
      WRITE( NOUT, FMT = 9990 )SNAMET
      STOP
      ERROR STOP
   50 LTEST( I ) = LTESTT
      GO TO 30
 *
@@ -237,7 +237,7 @@
      SAME = LCE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      TRANSB = 'C'
      CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@@ -246,7 +246,7 @@
      SAME = LCE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      DO 120 J = 1, N
         AB( J, NMAX + 1 ) = N - J + 1
@@ -264,7 +264,7 @@
      SAME = LCE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      TRANSB = 'C'
      CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@@ -273,7 +273,7 @@
      SAME = LCE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
 *
 *     Test each subroutine in turn.
@@ -385,7 +385,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
      STOP
      IF( FATAL ) THEN
         ERROR STOP
      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_cblat3_3m.f
+++ b/ctest/c_cblat3_3m.f
@@ -10,7 +10,7 @@
 *  'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
 *  F        LOGICAL FLAG, T TO STOP ON FAILURES.
 *  F        LOGICAL FLAG, T TO ERROR STOP ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@@ -194,7 +194,7 @@
     $      GO TO 50
   40 CONTINUE
      WRITE( NOUT, FMT = 9990 )SNAMET
      STOP
      ERROR STOP
   50 LTEST( I ) = LTESTT
      GO TO 30
 *
@@ -237,7 +237,7 @@
      SAME = LCE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      TRANSB = 'C'
      CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@@ -246,7 +246,7 @@
      SAME = LCE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      DO 120 J = 1, N
         AB( J, NMAX + 1 ) = N - J + 1
@@ -264,7 +264,7 @@
      SAME = LCE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      TRANSB = 'C'
      CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@@ -273,7 +273,7 @@
      SAME = LCE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
 *
 *     Test each subroutine in turn.
@@ -385,7 +385,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
      STOP
      IF( FATAL ) THEN
         ERROR STOP
      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_dblat1.f
+++ b/ctest/c_dblat1.f
@@ -44,9 +44,12 @@
            CALL CHECK3(SFAC)
         END IF
 *        -- Print
         IF (PASS) WRITE (NOUT,99998)
         IF (PASS) THEN
            WRITE (NOUT,99998)
         ELSE
            ERROR STOP
        END IF
   20 CONTINUE
      STOP
 *
 99999 FORMAT (' Real CBLAS Test Program Results',/1X)
 99998 FORMAT ('                                    ----- PASS -----')
@@ -136,7 +139,7 @@
            CALL STEST1(SS,DS1(K),DS1(K),SFAC)
         ELSE
            WRITE (NOUT,*) ' Shouldn''t be here in CHECK0'
            STOP
            ERROR STOP
         END IF
   20 CONTINUE
   40 RETURN
@@ -229,7 +232,7 @@
               CALL ITEST1(IDAMAXTEST(N,SX,INCX),ITRUE2(NP1))
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
               STOP
               ERROR STOP
            END IF
   60    CONTINUE
   80 CONTINUE
@@ -384,7 +387,7 @@
               CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0)
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
               STOP
               ERROR STOP
            END IF
  100    CONTINUE
  120 CONTINUE
@@ -472,7 +475,7 @@
   70          CONTINUE      
               ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK3'
               STOP
               ERROR STOP
            END IF
   40    CONTINUE
   60 CONTINUE
--- a/ctest/c_dblat2.f
+++ b/ctest/c_dblat2.f
@@ -10,7 +10,7 @@
 *  'DBLAT2.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
 *  F        LOGICAL FLAG, T TO STOP ON FAILURES.
 *  F        LOGICAL FLAG, T TO ERROR STOP ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@@ -239,7 +239,7 @@
     $      GO TO 70
   60 CONTINUE
      WRITE( NOUT, FMT = 9986 )SNAMET
      STOP
      ERROR STOP
   70 LTEST( I ) = LTESTT
      GO TO 50
 *
@@ -279,7 +279,7 @@
      SAME = LDE( YY, YT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
         STOP
         ERROR STOP
      END IF
      TRANS = 'T'
      CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
@@ -287,7 +287,7 @@
      SAME = LDE( YY, YT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
         STOP
         ERROR STOP
      END IF
 *
 *     Test each subroutine in turn.
@@ -414,7 +414,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
      STOP
      IF( FATAL ) THEN
         ERROR STOP
      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_dblat3.f
+++ b/ctest/c_dblat3.f
@@ -10,7 +10,7 @@
 *  'DBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
 *  F        LOGICAL FLAG, T TO STOP ON FAILURES.
 *  F        LOGICAL FLAG, T TO ERROR STOP ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@@ -189,7 +189,7 @@
     $      GO TO 50
   40 CONTINUE
      WRITE( NOUT, FMT = 9990 )SNAMET
      STOP
      ERROR STOP
   50 LTEST( I ) = LTESTT
      GO TO 30
 *
@@ -232,7 +232,7 @@
      SAME = LDE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      TRANSB = 'T'
      CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@@ -241,7 +241,7 @@
      SAME = LDE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      DO 120 J = 1, N
         AB( J, NMAX + 1 ) = N - J + 1
@@ -259,7 +259,7 @@
      SAME = LDE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      TRANSB = 'T'
      CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@@ -268,7 +268,7 @@
      SAME = LDE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
 *
 *     Test each subroutine in turn.
@@ -379,7 +379,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
      STOP
      IF( FATAL ) THEN
         ERROR STOP
      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_sblat1.f
+++ b/ctest/c_sblat1.f
@@ -44,9 +44,12 @@
            CALL CHECK3(SFAC)
         END IF
 *        -- Print
         IF (PASS) WRITE (NOUT,99998)
         IF (PASS) THEN
            WRITE (NOUT,99998)
         ELSE
            ERROR STOP
        END IF
   20 CONTINUE
      STOP
 *
 99999 FORMAT (' Real CBLAS Test Program Results',/1X)
 99998 FORMAT ('                                    ----- PASS -----')
@@ -136,7 +139,7 @@
            CALL STEST1(SS,DS1(K),DS1(K),SFAC)
         ELSE
            WRITE (NOUT,*) ' Shouldn''t be here in CHECK0'
            STOP
            ERROR STOP
         END IF
   20 CONTINUE
   40 RETURN
@@ -229,7 +232,7 @@
               CALL ITEST1(ISAMAXTEST(N,SX,INCX),ITRUE2(NP1))
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
               STOP
               ERROR STOP
            END IF
   60    CONTINUE
   80 CONTINUE
@@ -384,7 +387,7 @@
               CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0)
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
               STOP
               ERROR STOP
            END IF
  100    CONTINUE
  120 CONTINUE
@@ -479,7 +482,7 @@
   70          CONTINUE
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK3'
               STOP
               ERROR STOP
            END IF
   40    CONTINUE
   60 CONTINUE
@@ -759,4 +762,4 @@
         END IF
      END IF
      RETURN
      END
      END
--- a/ctest/c_sblat2.f
+++ b/ctest/c_sblat2.f
@@ -10,7 +10,7 @@
 *  'SBLAT2.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
 *  F        LOGICAL FLAG, T TO STOP ON FAILURES.
 *  F        LOGICAL FLAG, T TO ERROR STOP ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@@ -239,7 +239,7 @@
     $      GO TO 70
   60 CONTINUE
      WRITE( NOUT, FMT = 9986 )SNAMET
      STOP
      ERROR STOP
   70 LTEST( I ) = LTESTT
      GO TO 50
 *
@@ -279,7 +279,7 @@
      SAME = LSE( YY, YT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
         STOP
         ERROR STOP
      END IF
      TRANS = 'T'
      CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
@@ -287,7 +287,7 @@
      SAME = LSE( YY, YT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
         STOP
         ERROR STOP
      END IF
 *
 *     Test each subroutine in turn.
@@ -414,7 +414,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
      STOP
      IF( FATAL ) THEN
         ERROR STOP
      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_sblat3.f
+++ b/ctest/c_sblat3.f
@@ -10,7 +10,7 @@
 *  'SBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
 *  F        LOGICAL FLAG, T TO STOP ON FAILURES.
 *  F        LOGICAL FLAG, T TO ERROR STOP ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@@ -188,7 +188,7 @@
     $      GO TO 50
   40 CONTINUE
      WRITE( NOUT, FMT = 9990 )SNAMET
      STOP
      ERROR STOP
   50 LTEST( I ) = LTESTT
      GO TO 30
 *
@@ -231,7 +231,7 @@
      SAME = LSE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      TRANSB = 'T'
      CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@@ -240,7 +240,7 @@
      SAME = LSE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      DO 120 J = 1, N
         AB( J, NMAX + 1 ) = N - J + 1
@@ -258,7 +258,7 @@
      SAME = LSE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      TRANSB = 'T'
      CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@@ -267,7 +267,7 @@
      SAME = LSE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
 *
 *     Test each subroutine in turn.
@@ -378,7 +378,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
      STOP
      IF( FATAL ) THEN
         ERROR STOP
      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_zblat1.f
+++ b/ctest/c_zblat1.f
@@ -38,9 +38,12 @@
            CALL CHECK1(SFAC)
         END IF
 *        -- Print
         IF (PASS) WRITE (NOUT,99998)
         IF (PASS) THEN
            WRITE (NOUT,99998)
         ELSE
            ERROR STOP
        END IF
   20 CONTINUE
      STOP
 *
 99999 FORMAT (' Complex CBLAS Test Program Results',/1X)
 99998 FORMAT ('                                    ----- PASS -----')
@@ -228,7 +231,7 @@
               CALL ITEST1(IZAMAXTEST(N,CX,INCX),ITRUE3(NP1))
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK1'
               STOP
               ERROR STOP
            END IF
 *
   40    CONTINUE
@@ -512,7 +515,7 @@
               CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0)
            ELSE
               WRITE (NOUT,*) ' Shouldn''t be here in CHECK2'
               STOP
               ERROR STOP
            END IF
 *
   40    CONTINUE
--- a/ctest/c_zblat1c.c
+++ b/ctest/c_zblat1c.c
@@ -380,7 +380,7 @@ static doublereal c_b43 = 1.;
    static integer i__;
    extern /* Subroutine */ int ctest_(integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*);
    static doublecomplex mwpcs[5], mwpct[5];
    extern /* Subroutine */ int zscaltest_(integer*, doublereal*, doublecomplex*, integer*), itest1_(integer*, integer*), stest1_(doublereal*, doublereal*, doublereal*, doublereal*);
    extern /* Subroutine */ int zscaltest_(integer*, doublecomplex*, doublecomplex*, integer*), itest1_(integer*, integer*), stest1_(doublereal*, doublereal*, doublereal*, doublereal*);
    static doublecomplex cx[8];
    extern doublereal dznrm2test_(integer*, doublecomplex*, integer*);
    static integer np1;
@@ -595,7 +595,7 @@ static doublereal c_b43 = 1.;
    static integer ki;
    extern /* Subroutine */ int zdotutest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*), zswaptest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*);
    static integer kn;
    extern /* Subroutine */ int zaxpytest_(integer*, doublereal*, doublecomplex*, integer*, doublecomplex*, integer*);
    extern /* Subroutine */ int zaxpytest_(integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*);
    static doublecomplex cx[7], cy[7];
    static integer mx, my;

--- a/ctest/c_zblat2.f
+++ b/ctest/c_zblat2.f
@@ -10,7 +10,7 @@
 *  'CBLAT2.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
 *  F        LOGICAL FLAG, T TO STOP ON FAILURES.
 *  F        LOGICAL FLAG, T TO ERROR STOP ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@@ -243,7 +243,7 @@
     $      GO TO 70
   60 CONTINUE
      WRITE( NOUT, FMT = 9986 )SNAMET
      STOP
      ERROR STOP
   70 LTEST( I ) = LTESTT
      GO TO 50
 *
@@ -283,7 +283,7 @@
      SAME = LZE( YY, YT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
         STOP
         ERROR STOP
      END IF
      TRANS = 'T'
      CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G,
@@ -291,7 +291,7 @@
      SAME = LZE( YY, YT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR
         STOP
         ERROR STOP
      END IF
 *
 *     Test each subroutine in turn.
@@ -418,7 +418,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
      STOP
      IF( FATAL ) THEN
         ERROR STOP
      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_zblat3.f
+++ b/ctest/c_zblat3.f
@@ -10,7 +10,7 @@
 *  'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
 *  F        LOGICAL FLAG, T TO STOP ON FAILURES.
 *  F        LOGICAL FLAG, T TO ERROR STOP ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@@ -195,7 +195,7 @@
     $      GO TO 50
   40 CONTINUE
      WRITE( NOUT, FMT = 9990 )SNAMET
      STOP
      ERROR STOP
   50 LTEST( I ) = LTESTT
      GO TO 30
 *
@@ -238,7 +238,7 @@
      SAME = LZE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      TRANSB = 'C'
      CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@@ -247,7 +247,7 @@
      SAME = LZE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      DO 120 J = 1, N
         AB( J, NMAX + 1 ) = N - J + 1
@@ -265,7 +265,7 @@
      SAME = LZE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      TRANSB = 'C'
      CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@@ -274,7 +274,7 @@
      SAME = LZE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
 *
 *     Test each subroutine in turn.
@@ -386,7 +386,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
      STOP
      IF( FATAL ) THEN
         ERROR STOP
      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/c_zblat3_3m.f
+++ b/ctest/c_zblat3_3m.f
@@ -10,7 +10,7 @@
 *  'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE
 *  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
 *  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
 *  F        LOGICAL FLAG, T TO STOP ON FAILURES.
 *  F        LOGICAL FLAG, T TO ERROR STOP ON FAILURES.
 *  T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 *  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 *  16.0     THRESHOLD VALUE OF TEST RATIO
@@ -195,7 +195,7 @@
     $      GO TO 50
   40 CONTINUE
      WRITE( NOUT, FMT = 9990 )SNAMET
      STOP
      ERROR STOP
   50 LTEST( I ) = LTESTT
      GO TO 30
 *
@@ -238,7 +238,7 @@
      SAME = LZE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      TRANSB = 'C'
      CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@@ -247,7 +247,7 @@
      SAME = LZE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      DO 120 J = 1, N
         AB( J, NMAX + 1 ) = N - J + 1
@@ -265,7 +265,7 @@
      SAME = LZE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
      TRANSB = 'C'
      CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
@@ -274,7 +274,7 @@
      SAME = LZE( CC, CT, N )
      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
         STOP
         ERROR STOP
      END IF
 *
 *     Test each subroutine in turn.
@@ -386,7 +386,9 @@
      IF( TRACE )
     $   CLOSE ( NTRA )
      CLOSE ( NOUT )
      STOP
      IF( FATAL ) THEN
         ERROR STOP
      END IF
 *
 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )
--- a/ctest/cblas_test.h
+++ b/ctest/cblas_test.h
@@ -10,6 +10,15 @@
 #define int long
 #endif

 #if defined(_MSC_VER) && defined(__INTEL_CLANG_COMPILER)
 //#define LAPACK_COMPLEX_STRUCTURE
 #define NOCHANGE
 #endif
 /* e.g. mingw64/x86_64-w64-mingw32/include/winerror.h */
 #ifdef FAILED
 #undef FAILED
 #endif

 #define  TRUE           1
 #define  PASSED         1
 #define  TEST_ROW_MJR	1
--- a/docs/build_system.md
+++ b/docs/build_system.md
@@ -1,104 +1,122 @@
 This page describes the Make-based build, which is the default/authoritative
 build method. Note that the OpenBLAS repository also supports building with
 CMake (not described here) - that generally works and is tested, however there
 may be small differences between the Make and CMake builds.
 !!! info "Supported build systems"

    This page describes the Make-based build, which is the
    default/authoritative build method. Note that the OpenBLAS repository also
    supports building with CMake (not described here) - that generally works
    and is tested, however there may be small differences between the Make and
    CMake builds.


 ## Makefile dependency graph

 <!---
 An easy way to update this diagram is to copy it into https://mermaid.live
 and edit it interactively.
 -->

 ```mermaid
 flowchart LR
    A[Makefile] -->|included by many of the Makefiles in the subdirectories!| B(Makefile.system)
        B -->|triggered, not included, once by Makefile.system, and runs before any of the actual library code is built. builds and runs the 'getarch' tool for cpu identification, runs the compiler detection scripts c_check/f_check| C{Makefile.prebuild}
            C -->|either this or Makefile_kernel.conf is generated| D[Makefile.conf]
            C -->|temporary Makefile.conf during DYNAMIC_ARCH builds| E[Makefile_kernel.conf]
        B -->|defaults for build options that can be given on the make command line| F[Makefile.rule]
        B -->|architecture-specific compiler options and OpenBLAS buffer size values| G[Makefile.$ARCH]
    A --> exports
    A -->|directories: test, ctest, utest, cpp_thread_test| H(test directories)
    A --> I($BLASDIRS)
        I --> interface
        I --> driver/level2
        I --> driver/level3
        I --> driver/others
    A -->|for each target in DYNAMIC_CORE if DYNAMIC_ARCH=1| kernel
    A -->|subdirs: timing, testing, testing/EIG, testing/LIN| J($NETLIB_LAPACK_DIR)
    A --> relapack
 ```

 !!! warning
    This page is made by someone who is not the developer and should not be considered as an official documentation of the build system. For getting the full picture, it is best to read the Makefiles and understand them yourself.

 ## Makefile dep graph
 ## Important Variables

 ```
 Makefile                                                        
 |                                                               
 |-----  Makefile.system # !!! this is included by many of the Makefiles in the subdirectories !!!
 |       |
 |       |=====  Makefile.prebuild # This is triggered (not included) once by Makefile.system 
 |       |       |                 # and runs before any of the actual library code is built.
 |       |       |                 # (builds and runs the "getarch" tool for cpu identification,
 |       |       |                 # runs the compiler detection scripts c_check and f_check) 
 |       |       |
 |       |       -----  (Makefile.conf) [ either this or Makefile_kernel.conf is generated ] 
 |       |       |                            { Makefile.system#L243 }
 |       |       -----  (Makefile_kernel.conf) [ temporary Makefile.conf during DYNAMIC_ARCH builds ]
 |       |
 |       |-----  Makefile.rule # defaults for build options that can be given on the make command line
 |       |
 |       |-----  Makefile.$(ARCH) # architecture-specific compiler options and OpenBLAS buffer size values
 |
 |~~~~~ exports/
 |
 |~~~~~ test/
 |
 |~~~~~ utest/  
 |
 |~~~~~ ctest/
 |
 |~~~~~ cpp_thread_test/
 |
 |~~~~~ kernel/
 |
 |~~~~~ ${SUBDIRS}
 |
 |~~~~~ ${BLASDIRS}
 |
 |~~~~~ ${NETLIB_LAPACK_DIR}{,/timing,/testing/{EIG,LIN}}
 |
 |~~~~~ relapack/
 ```
 Most of the tunable variables are found in
 [Makefile.rule](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.rule),
 along with their detailed descriptions.

 ## Important Variables
 Most of the variables are detected automatically in
 [Makefile.prebuild](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.prebuild),
 if they are not set in the environment.

 Most of the tunable variables are found in [Makefile.rule](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.rule), along with their detailed descriptions.<br/>
 Most of the variables are detected automatically in [Makefile.prebuild](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.prebuild), if they are not set in the environment.
 The most commonly used variables are documented below. There are more options
 though - please read the linked Makefiles if you want to see all variables.

 ### CPU related
 ```
 ARCH         - Target architecture (eg. x86_64)
 TARGET       - Target CPU architecture, in case of DYNAMIC_ARCH=1 means library will not be usable on less capable CPUs
 TARGET_CORE  - TARGET_CORE will override TARGET internally during each cpu-specific cycle of the build for DYNAMIC_ARCH
 DYNAMIC_ARCH - For building library for multiple TARGETs (does not lose any optimizations, but increases library size)
 DYNAMIC_LIST - optional user-provided subset of the DYNAMIC_CORE list in Makefile.system
 ```

 ### Toolchain related
 ```
 CC                 - TARGET C compiler used for compilation (can be cross-toolchains)
 FC                 - TARGET Fortran compiler used for compilation (can be cross-toolchains, set NOFORTRAN=1 if used cross-toolchain has no fortran compiler)
 AR, AS, LD, RANLIB - TARGET toolchain helpers used for compilation (can be cross-toolchains)
 - `ARCH`: target architecture (e.g., `x86-64`).
 - `DYNAMIC_ARCH`: For building library for multiple `TARGET`s (does not lose any
  optimizations, but increases library size).
 - `DYNAMIC_LIST`: optional user-provided subset of the `DYNAMIC_CORE` list in
   [Makefile.system](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.system).
 - `TARGET`: target CPU architecture. In case of `DYNAMIC_ARCH=1`, it means that
  the library will not be usable on less capable CPUs.
 - `TARGET_CORE`: override `TARGET` internally during each CPU-specific cycle of
  the build for `DYNAMIC_ARCH`.

 HOSTCC             - compiler of build machine, needed to create proper config files for target architecture
 HOST_CFLAGS        - flags for build machine compiler
 ```

 ### Library related
 ```
 BINARY          - 32/64 bit library
 ### Toolchain related

 BUILD_SHARED    - Create shared library
 BUILD_STATIC    - Create static library
 - `CC`: `TARGET` C compiler used for compilation (can be cross-toolchains).
 - `FC`: `TARGET` Fortran compiler used for compilation (can be cross-toolchains,
  set `NOFORTRAN=1` if the used cross-toolchain has no Fortran compiler).
 - `COMMON_OPT`: flags to add to all invocations of the target C and Fortran compilers
  (overrides `CFLAGS`/`FFLAGS` - prefer using `COMMON_OPT`)
 - `CCOMMON_OPT`: flags to add to all invocations of the target C compiler
  (overrides `CFLAGS`)
 - `FCOMMON_OPT`: flags to add to all invocations of the target Fortran compiler
  (overrides `FFLAGS`)
 - `LDFLAGS`: flags to add to all target linker invocations
 - `AR`, `AS`, `LD`, `RANLIB`: `TARGET` toolchain helpers used for compilation
  (can be cross-toolchains).
 - `HOSTCC`: compiler of build machine, needed to create proper config files for
  the target architecture.
 - `HOST_CFLAGS`: flags for the build machine compiler.

 QUAD_PRECISION  - enable support for IEEE quad precision [ largely unimplemented leftover from GotoBLAS, do not use ]
 EXPRECISION     - Obsolete option to use float80 of SSE on BSD-like systems
 INTERFACE64     - Build with 64bit integer representations to support large array index values [ incompatible with standard API ]

 BUILD_SINGLE    - build the single-precision real functions of BLAS [and optionally LAPACK] 
 BUILD_DOUBLE    - build the double-precision real functions
 BUILD_COMPLEX   - build the single-precision complex functions
 BUILD_COMPLEX16 - build the double-precision complex functions
 (all four types are included in the build by default when none was specifically selected)
 ### Library related

 BUILD_BFLOAT16  - build the "half precision brainfloat" real functions 
 #### Library kind and bitness options

 - `BINARY`: whether to build a 32-bit or 64-bit library (default is `64`, set
  to `32` on a 32-bit platform).
 - `INTERFACE64`: build with 64-bit (ILP64) integer representations to support
  large array index values (incompatible with the standard 32-bit integer (LP64) API).
 - `NO_STATIC`: if set to `1`, don't build a static library (default is `0`)
 - `NO_SHARED`: if set to `1`, don't build a shared library (default is `0`)

 #### Data type options

 - `BUILD_SINGLE`: build the single-precision real functions of BLAS and (if
  it's built) LAPACK
 - `BUILD_DOUBLE`: build the double-precision real functions
 - `BUILD_COMPLEX`: build the single-precision complex functions
 - `BUILD_COMPLEX16`: build the double-precision complex functions
 - `BUILD_BFLOAT16`: build the "half precision brainfloat" real functions 
 - `EXPRECISION`: (do not use, this is a work in progress) option to use `long
  double` functions

 By default, the single- and double-precision real and complex floating-point
 functions are included in the build, while the half- and extended-precision
 functions are not.
 
 USE_THREAD      - Use a multithreading backend (default to pthread)
 USE_LOCKING     - implement locking for thread safety even when USE_THREAD is not set (so that the singlethreaded library can
                  safely be called from multithreaded programs)
 USE_OPENMP      - Use OpenMP as multithreading backend
 NUM_THREADS     - define this to the maximum number of parallel threads you expect to need (defaults to the number of cores in the build cpu)
 NUM_PARALLEL    - define this to the number of OpenMP instances that your code may use for parallel calls into OpenBLAS (default 1,see below)

 ```

 #### Threading options

 - `USE_THREAD`: Use a multithreading backend (defaults to `pthreads`).
 - `USE_LOCKING`: implement locking for thread safety even when `USE_THREAD` is
  not set (so that the single-threaded library can safely be called from
  multithreaded programs).
 - `USE_OPENMP`: Use OpenMP as multithreading backend
 - `NUM_THREADS`: define this to the maximum number of parallel threads you
  expect to need (defaults to the number of cores in the build CPU).
 - `NUM_PARALLEL`: define this to the number of OpenMP instances that your code
  may use for parallel calls into OpenBLAS (the default is `1`, see below).

 OpenBLAS uses a fixed set of memory buffers internally, used for communicating
 and compiling partial results from individual threads. For efficiency, the
@@ -118,3 +136,32 @@ same time, then only one of them will be able to make progress while all the
 rest of them spin-wait for the one available buffer. Setting `NUM_PARALLEL` to
 the upper bound on the number of OpenMP runtimes that you can have in a process
 ensures that there are a sufficient number of buffer sets available.

 #### Library and symbol name options

 - `FIXED_LIBNAME`: if set to `1`, uses a non-versioned name for the library and
  no symbolic linking to variant names (default is `0`)
 - `LIBNAMEPREFIX`: prefix that, if given, will be inserted in the library name
  before `openblas` (e.g., `xxx` will result in `libxxxopenblas.so`)
 - `LIBNAMESUFFIX`: suffix that, if given, will be inserted in the library name
  after `openblas`, separated by an underscore (e.g., `yyy` will result in
  `libopenblas_yyy.so`)
 - `SYMBOLPREFIX`: prefix that, if given, will be added to all symbol names
  *and* to the library name
 - `SYMBOLSUFFIX`: suffix that, if given, will be added to all symbol names
  *and* to the library name

 #### BLAS and LAPACK options

 By default, the Fortran and C interfaces to BLAS and LAPACK are built,
 including deprecated functions, while
 [ReLAPACK](https://github.com/HPAC/ReLAPACK) is not.

 - `NO_CBLAS`: if set to `1`, don't build the CBLAS interface (default is `0`)
 - `ONLY_CBLAS`: if set to `1`, only build the CBLAS interface (default is `0`)
 - `NO_LAPACK`: if set to `1`, don't build LAPACK (default is `0`)
 - `NO_LAPACKE`: if set to `1`, don't build the LAPACKE interface (default is `0`)
 - `BUILD_LAPACK_DEPRECATED`: if set to `0`, don't build deprecated LAPACK
  functions (default is `1`)
 - `BUILD_RELAPACK`: if set to `1`, build Recursive LAPACK on top of LAPACK
  (default is `0`)
--- a/docs/extensions.md
+++ b/docs/extensions.md
@@ -5,14 +5,14 @@ This page documents those non-standard APIs.

 ## BLAS-like extensions

 | Routine       | Data Types    | Description     |
 | ------------- |:------------- | :---------------|
 | ?axpby        | s,d,c,z       | like axpy with a multiplier for y    |       
 | ?gemm3m       | c,z           | gemm3m            |
 | ?imatcopy     | s,d,c,z       | in-place transpositon/copying   |
 | ?omatcopy     | s,d,c,z       | out-of-place transpositon/copying    |
 | ?geadd        | s,d,c,z       | matrix add   |
 | ?gemmt        | s,d,c,z       | gemm but only a triangular part updated|
 | Routine       | Data Types    | Description                                     |
 | ------------- |:------------- | :-----------------------------------------------|
 | ?axpby        | s,d,c,z       | like `axpy` with a multiplier for `y`           |
 | ?gemm3m       | c,z           | `gemm3m`                                        |
 | ?imatcopy     | s,d,c,z       | in-place transposition/copying                  |
 | ?omatcopy     | s,d,c,z       | out-of-place transposition/copying              |
 | ?geadd        | s,d,c,z       | ATLAS-like matrix add `B = &alpha;*A+&beta;*B`  |
 | ?gemmt        | s,d,c,z       | `gemm` but only a triangular part updated       |


 ## bfloat16 functionality
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -51,9 +51,9 @@ In practice, the values are derived by experimentation to yield the block sizes

 ### <a name="reportbug"></a>How can I report a bug?

 Please file an issue at this [issue page](https://github.com/xianyi/OpenBLAS/issues) or send mail to the [OpenBLAS mailing list](https://groups.google.com/forum/#!forum/openblas-users).
 Please file an issue at this [issue page](https://github.com/OpenMathLib/OpenBLAS/issues) or send mail to the [OpenBLAS mailing list](https://groups.google.com/forum/#!forum/openblas-users).

 Please provide the following information: CPU, OS, compiler, and OpenBLAS compiling flags (Makefile.rule). In addition, please describe how to reproduce this bug.
 Please provide the following information: CPU, OS, compiler, OpenBLAS version and any compiling flags you used (Makefile.rule). In addition, please describe how to reproduce this bug.

 ### <a name="publication"></a>How to reference OpenBLAS.

@@ -99,13 +99,13 @@ Here is the result of the DGEMM subroutine's performance on Intel Core i5-2500K

 ### <a name="MSVC"></a>How can I call an OpenBLAS function in Microsoft Visual Studio?

 Please read [this page](install.md#visual-studio).
 Please read [this page](install.md#visual-studio-native-windows-abi).

 ### <a name="C99_complex_number"></a>How can I use CBLAS and LAPACKE without C99 complex number support (e.g. in Visual Studio)?

 Zaheer has fixed this bug. You can now use the structure instead of C99 complex numbers. Please read [this issue page](http://github.com/xianyi/OpenBLAS/issues/95) for details.

 [This issue](https://github.com/xianyi/OpenBLAS/issues/305) is for using LAPACKE in Visual Studio.
 [This issue](https://github.com/OpenMathLib/OpenBLAS/issues/305) is for using LAPACKE in Visual Studio.

 ### <a name="Linux_SEGFAULT"></a>I get a SEGFAULT with multi-threading on Linux. What's wrong?

@@ -134,6 +134,13 @@ Background: OpenBLAS implements optimized versions of some LAPACK functions, so
 Some of the LAPACK tests, notably in xeigtstz, try to allocate around 10MB on the stack. You may need to use
 `ulimit -s` to change the default limits on your system to allow this.

 ### <a name="lapack_test"></a>My build worked fine and passed the BLAS tests, but running `make lapack-test` ends with a number of errors in the summary report

 The LAPACK tests were primarily created to test the validity of the Reference-LAPACK implementation, which is implemented in unoptimized, single-threaded Fortran code. This makes it very sensitive to small numerical deviations that can result from the use of specialized cpu instructions that combine multiplications and additions without intermediate rounding and storing to memory (FMA), or from changing the order of mathematical operations by splitting an original problem workload into smaller tasks that are solved in parallel. As a result, you may encounter a small number of errors in the "numerical" column of
 the summary table at the end of the `make lapack-test` run - this is usually nothing to worry about, and the exact number and distribution of errors among the
 four data types will often vary with the optimization flags you supplied to the compiler, or the cpu model for which you built OpenBLAS. Sporadic errors in the column labeled `other` are normally the sign of failed convergence of iterative diagonalizations for the same reasons just mentioned. A more detailed error report is stored in the file testing_results.txt - this should be consulted in case of doubt. Care should be taken if you encounter numerical errors in the hundreds, or `other` errors accompanied by the LAPACK error message "on entry to function_name parameter X had an illegal value" that signals a problem with argument passing between individual functions.
 (See also [this issue](https://github.com/OpenMathLib/OpenBLAS/issues/4032) in the issue tracker on github for additional discussion, examples and links)

 ### <a name="no_affinity"></a>How could I disable OpenBLAS threading affinity on runtime?

 You can define the OPENBLAS_MAIN_FREE or GOTOBLAS_MAIN_FREE environment variable to disable threading affinity on runtime. For example, before the running,
--- a/docs/install.md
+++ b/docs/install.md
@@ -437,49 +437,72 @@ To then use the built OpenBLAS shared library in Visual Studio:
      [Qt Creator](http://qt.nokia.com/products/developer-tools/).


 #### Windows on Arm

 The following tools needs to be installed to build for Windows on Arm (WoA):

 -   Clang for Windows on Arm.
    Find the latest LLVM build for WoA from [LLVM release page](https://releases.llvm.org/).
    E.g: LLVM 12 build for WoA64 can be found [here](https://github.com/llvm/llvm-project/releases/download/llvmorg-12.0.0/LLVM-12.0.0-woa64.exe)
    Run the LLVM installer and ensure that LLVM is added to environment PATH.
 -   Download and install classic Flang for Windows on Arm.
    Classic Flang is the only available Fortran compiler for Windows on Arm for now.
    A pre-release build can be found [here](https://github.com/kaadam/flang/releases/tag/v0.1)
    There is no installer for classic flang and the zip package can be
    extracted and the path needs to be added to environment `PATH`.
    E.g., in PowerShell:
    ```
    $env:Path += ";C:\flang_woa\bin"
    ```

 The following steps describe how to build the static library for OpenBLAS with and without LAPACK:

 1.  Build OpenBLAS static library with BLAS and LAPACK routines with Make:

    ```bash
    $ make CC="clang-cl" HOSTCC="clang-cl" AR="llvm-ar" BUILD_WITHOUT_LAPACK=0 NOFORTRAN=0 DYNAMIC_ARCH=0 TARGET=ARMV8 ARCH=arm64 BINARY=64 USE_OPENMP=0 PARALLEL=1 RANLIB="llvm-ranlib" MAKE=make F_COMPILER=FLANG FC=FLANG FFLAGS_NOOPT="-march=armv8-a -cpp" FFLAGS="-march=armv8-a -cpp" NEED_PIC=0 HOSTARCH=arm64 libs netlib
    ```

 2.  Build static library with BLAS routines using CMake:

    Classic Flang has compatibility issues with CMake, hence only BLAS routines can be compiled with CMake:

    ```bash
    $ mkdir build
    $ cd build
    $ cmake ..  -G Ninja -DCMAKE_C_COMPILER=clang -DBUILD_WITHOUT_LAPACK=1 -DNOFORTRAN=1 -DDYNAMIC_ARCH=0 -DTARGET=ARMV8 -DARCH=arm64 -DBINARY=64 -DUSE_OPENMP=0 -DCMAKE_SYSTEM_PROCESSOR=ARM64 -DCMAKE_CROSSCOMPILING=1 -DCMAKE_SYSTEM_NAME=Windows
    $ cmake --build . --config Release
    ```

 !!! tip "`getarch.exe` execution error"

    If you notice that platform-specific headers by `getarch.exe` are not
    generated correctly, this could be due to a known debug runtime DLL issue for
    arm64 platforms. Please check out [this page](https://linaro.atlassian.net/wiki/spaces/WOAR/pages/28677636097/Debug+run-time+DLL+issue#Workaround)
    for a workaround.
 ### Windows on Arm

 A fully functional native OpenBLAS for WoA that can be built as both a static and dynamic library using LLVM toolchain and Visual Studio 2022. Before starting to build, make sure that you have installed Visual Studio 2022 on your ARM device, including the "Desktop Development with C++" component (that contains the cmake tool).
 (Note that you can use the free "Visual Studio 2022 Community Edition" for this task. In principle it would be possible to build with VisualStudio alone, but using
 the LLVM toolchain enables native compilation of the Fortran sources of LAPACK and of all the optimized assembly files, which VisualStudio cannot handle on its own)

 1. Clone OpenBLAS to your local machine and checkout to latest release of
   OpenBLAS (unless you want to build the latest development snapshot - here we
   are using  the 0.3.28 release as the example, of course this exact version
   may be outdated by the time you read this)
  
       ```cmd
       git clone https://github.com/OpenMathLib/OpenBLAS.git
       cd OpenBLAS
       git checkout v0.3.28
       ```
  
 2. Install Latest LLVM toolchain for WoA:

       Download the Latest LLVM toolchain for WoA from [the Release
       page](https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5). At
       the time of writing, this is version 19.1.5 - be sure to select the
       latest release for which you can find a precompiled package whose name ends
       in "-woa64.exe" (precompiled packages usually lag a week or two behind their
       corresponding source release).  Make sure to enable the option
       *“Add LLVM to the system PATH for all the users”*.

       Note: Make sure that the path of LLVM toolchain is at the top of Environment
       Variables section to avoid conflicts between the set of compilers available
       in the system path

 3. Launch the Native Command Prompt for Windows ARM64:

       From the start menu search for *"ARM64 Native Tools Command Prompt for Visual
       Studio 2022"*. Alternatively open command prompt, run the following command to
       activate the environment:

       ```cmd
       C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsarm64.bat
       ```

 4. Navigate to the OpenBLAS source code directory and start building OpenBLAS
   by invoking Ninja:
   
       ```cmd
       cd OpenBLAS
       mkdir build
       cd build
       
       cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_C_COMPILER=arm64-pc-windows-msvc -DCMAKE_ASM_COMPILER=arm64-pc-windows-msvc -DCMAKE_Fortran_COMPILER=flang-new

       ninja -j16
       ```
       
       Note: You might want to include additional options in the cmake command
       here. For example, the default configuration only generates a
       `static.lib` version of the library. If you prefer a DLL, you can add
       `-DBUILD_SHARED_LIBS=ON`.

       Note that it is also possible to use the same setup to build OpenBLAS
       with Make, if you prefer Makefiles over the CMake build for some
       reason:

       ```cmd
       $ make CC=clang-cl FC=flang-new AR="llvm-ar" TARGET=ARMV8 ARCH=arm64 RANLIB="llvm-ranlib" MAKE=make
       ```


 #### Generating an import library
@@ -501,7 +524,7 @@ In your shell, move to this directory: `cd exports`.
    incompatibility in the C ABI would be a bug).

    The import libraries of MSVC have the suffix `.lib`. They are generated
    from a `.def` file using MSVC's `lib.exe`. See [the MSVC instructions](use_visual_studio.md#generate-import-library-before-0210-version).
    from a `.def` file using MSVC's `lib.exe`.

 === "MinGW"

@@ -532,7 +555,6 @@ In your shell, move to this directory: `cd exports`.
 To build OpenBLAS for Android, you will need the following tools installed on your machine:

 - [The Android NDK](https://developer.android.com/ndk/)
 - Perl
 - Clang compiler on the build machine

 The next two sections below describe how to build with Clang for ARMV7 and
@@ -574,7 +596,9 @@ utility in the make command above, like so:
 AR=${NDK_BUNDLE_DIR}/toolchains/arm-linux-androideabi-4.9/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-gcc-ar
 ```
 otherwise you may get a linker error complaining like `malformed archive header
 name at 8` when the native macOS `ar` command was invoked instead.
 name at 8` when the native macOS `ar` command was invoked instead. Note that
 with recent NDK versions, the AR tool may be named `llvm-ar` rather than what
 is assumed above.

 
 #### Building for ARMV8
@@ -604,12 +628,17 @@ Note: for NDK 23b, something as simple as:
 export PATH=/opt/android-ndk-r23b/toolchains/llvm/prebuilt/linux-x86_64/bin/:$PATH
 make HOSTCC=gcc CC=/opt/android-ndk-r23b/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android31-clang ONLY_CBLAS=1 TARGET=ARMV8
 ```
 appears to be sufficient on Linux.
 appears to be sufficient on Linux. On OSX, setting AR to the ar provided in the
 "bin" path of the NDK (probably `llvm-ar`) is also necessary.


 ??? note "Alternative build script for 3 architectures"

    This script will build OpenBLAS for 3 architecture (`ARMV7`, `ARMV8`, `X86`) and install them to `/opt/OpenBLAS/lib`.
    This script will build OpenBLAS for 3 architecture (`ARMV7`, `ARMV8`,
    `X86`) and install them to `/opt/OpenBLAS/lib`. Of course you can also copy
    only the section that is of interest to you - also notice that the `AR=`
    line may need adapting to the name of the ar tool provided in your
    `$TOOLCHAIN/bin` - for example `llvm-ar` in some recent NDK versions.
    It was tested on macOS with NDK version 21.3.6528147.

    ```bash
@@ -680,6 +709,40 @@ make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
 Adjust `MIN_IOS_VERSION` as necessary for your installation. E.g., change the version number
 to the minimum iOS version you want to target and execute this file to build the library.

 ### HarmonyOS

 For this target you will need the cross-compiler toolchain package by Huawei,
 which contains solutions for both Windows and Linux. Only the Linux-based
 toolchain has been tested so far, but the following instructions may apply
 similarly to Windows:

 Download [this HarmonyOS 4.1.1 SDK](https://repo.huaweicloud.com/harmonyos/os/4.1.1-Release/ohos-sdk-windows_linux-public.tar.gz),
 or whatever newer version may be available in the future). Use `tar -xvf
 ohos-sdk-windows_linux_public.tar.gz` to unpack it somewhere on your system.
 This will create a folder named "ohos-sdk" with subfolders "linux" and
 "windows". In the linux one you will find a ZIP archive named
 `native-linux-x64-4.1.7.8-Release.zip` - you need to unzip this where you want
 to install the cross-compiler, for example in `/opt/ohos-sdk`.

 In the directory where you unpacked OpenBLAS, create a build directory for cmake, and change into it :
 ```bash
 mkdir build
 cd build
 ```
 Use the version of `cmake` that came with the SDK, and specify the location of
 its toolchain file as a cmake option. Also set the build target for OpenBLAS to
 `ARMV8` and specify `NOFORTRAN=1` (at least as of version 4.1.1, the SDK
 contains no Fortran compiler):
 ```bash
 /opt/ohos-sdk/linux/native/build-tools/cmake/bin/cmake \
      -DCMAKE_TOOLCHAIN_FILE=/opt/ohos-sdk/linux/native/build/cmake/ohos.toolchain.cmake \
      -DOHOS_ARCH="arm64-v8a" -DTARGET=ARMV8 -DNOFORTRAN=1 ..
 ```
 Additional other OpenBLAS build options like `USE_OPENMP=1` or `DYNAMIC_ARCH=1`
 will probably work too. Finally do the build:
 ```bash
 /opt/ohos-sdk/linux/native/build-tools/cmake/bin/cmake --build .
 ```

 ### MIPS

--- a/docs/runtime_variables.md
+++ b/docs/runtime_variables.md
@@ -0,0 +1,38 @@
 OpenBLAS checks the following environment variables on startup:

 * `OPENBLAS_NUM_THREADS`: the number of threads to use (for non-OpenMP builds
  of OpenBLAS)
 * `OMP_NUM_THREADS`: the number of threads to use (for OpenMP builds - note
  that setting this may also affect any other OpenMP code)
 * `OPENBLAS_DEFAULT_NUM_THREADS`: the number of threads to use, irrespective if
  OpenBLAS was built for OpenMP or pthreads

 * `OPENBLAS_MAIN_FREE=1`: this can be used to disable automatic assignment of
  cpu affinity in OpenBLAS builds that have it enabled by default
 * `OPENBLAS_THREAD_TIMEOUT`: this can be used to define the length of time
  that idle threads should wait before exiting
 * `OMP_ADAPTIVE=1`: this can be used in OpenMP builds to actually remove any
  surplus threads when the number of threads is decreased


 `DYNAMIC_ARCH` builds also accept the following:

 * `OPENBLAS_VERBOSE`:

    - set this to `1` to enable a warning when there is no exact match for the
      detected cpu in the library
    - set this to `2` to make OpenBLAS print the name of the cpu target it
      autodetected

 * `OPENBLAS_CORETYPE`: set this to one of the supported target names to
  override autodetection, e.g., `OPENBLAS_CORETYPE=HASWELL`
 * `OPENBLAS_L2_SIZE`: set this to override the autodetected size of the L2
  cache where it is not reported correctly (in virtual environments)


 Deprecated variables still recognized for compatibilty:

 * `GOTO_NUM_THREADS`: equivalent to `OPENBLAS_NUM_THREADS`
 * `GOTOBLAS_MAIN_FREE`: equivalent to `OPENBLAS_MAIN_FREE`
 * `OPENBLAS_BLOCK_FACTOR`: this applies a scale factor to the GEMM "P"
  parameter of the block matrix code, see file `driver/others/parameter.c`
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -742,7 +742,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
    num_parts  = 0;
    while (n > 0){
      width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
      if (width < switch_ratio) {
      if (width < switch_ratio && width > 1) {
        width = switch_ratio;
      }
      width = round_up(n, width, GEMM_PREFERED_SIZE);
--- a/driver/others/CMakeLists.txt
+++ b/driver/others/CMakeLists.txt
@@ -54,6 +54,8 @@ if (DYNAMIC_ARCH)
    list(APPEND COMMON_SOURCES dynamic_power.c)
  elseif (RISCV64)
    list(APPEND COMMON_SOURCES dynamic_riscv64.c detect_riscv64.c)
  elseif (LOONGARCH64)
    list(APPEND COMMON_SOURCES dynamic_loongarch64.c)
  else ()  
    list(APPEND COMMON_SOURCES dynamic.c)
  endif ()  
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@@ -1076,7 +1076,13 @@ fprintf(STDERR, "Server[%2ld] Calculation started.  Mode = 0x%03x M = %3ld N=%3l
      main_status[cpu] = MAIN_RUNNING1;
 #endif

 //For target LOONGSON3R5, applying an offset to the buffer is essential
 if (buffer == NULL) {
 	blas_thread_buffer[cpu] = blas_memory_alloc(2);
 	buffer = blas_thread_buffer[cpu];
 }      

 	
 //For LOONGARCH64, applying an offset to the buffer is essential
 //for minimizing cache conflicts and optimizing performance.
 #if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY)
      if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A);
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -114,9 +114,11 @@ void goto_set_num_threads(int num_threads) {

  adjust_thread_buffers();
 #if defined(ARCH_MIPS64) || defined(ARCH_LOONGARCH64)
 #ifndef DYNAMIC_ARCH
  //set parameters for different number of threads.
  blas_set_parameter();
 #endif
 #endif

 }
 void openblas_set_num_threads(int num_threads) {
@@ -124,6 +126,18 @@ void openblas_set_num_threads(int num_threads) {
 	goto_set_num_threads(num_threads);
 }

 #ifdef OS_LINUX

 int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) {
  fprintf(stderr,"OpenBLAS: use OpenMP environment variables for setting cpu affinity\n");
  return -1;
 }
 int openblas_getaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) {
  fprintf(stderr,"OpenBLAS: use OpenMP environment variables for querying cpu affinity\n");
  return -1;
 }
 #endif

 int blas_thread_init(void){

 #if defined(__FreeBSD__) && defined(__clang__)
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -271,22 +271,59 @@ static gotoblas_t *get_coretype(void) {

  if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
 #ifdef __linux
 	int i;
        int ncores=0;
 	int prt,cpucap,cpulowperf=0,cpumidperf=0,cpuhiperf=0;
        FILE *infile;
        char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL;
        p = (char *) NULL ;
 	infile = fopen("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1","r");
 	if (!infile) return NULL;
 	(void)fgets(buffer, sizeof(buffer), infile);
 	midr_el1=strtoul(buffer,NULL,16);
 	fclose(infile);
 #else
        char buffer[512], *cpu_part = NULL, *cpu_implementer = NULL;

 	infile = fopen("/sys/devices/system/cpu/possible","r");
 	if (infile) {
 		(void)fgets(buffer, sizeof(buffer), infile);
 		sscanf(buffer,"0-%d",&ncores);
 		fclose (infile);
 		ncores++;
 	} else {
 		infile = fopen("/proc/cpuinfo","r");
                while (fgets(buffer, sizeof(buffer), infile)) {
                        if (!strncmp("processor", buffer, 9))
                        ncores++;
 		}
 	}
 	for (i=0;i<ncores;i++) {
 		sprintf(buffer,"/sys/devices/system/cpu/cpu%d/regs/identification/midr_el1",i);
 		infile = fopen(buffer,"r");
 		if (!infile) return NULL;
 		(void)fgets(buffer, sizeof(buffer), infile);
 		midr_el1=strtoul(buffer,NULL,16);
  		implementer = (midr_el1 >> 24) & 0xFF;
  		prt        = (midr_el1 >> 4)  & 0xFFF;
 		fclose(infile);
 		sprintf(buffer,"/sys/devices/system/cpu/cpu%d/cpu_capability",i);
 		infile = fopen(buffer,"r");
 		if (infile) {
 			(void)fgets(buffer, sizeof(buffer), infile);
 			cpucap=strtoul(buffer,NULL,16);
 			fclose(infile);
 			if (cpucap >= 1000) cpuhiperf++;
 			else if (cpucap >=500) cpumidperf++;
 			else cpulowperf++;
 			if (cpucap >=1000) part = prt;
 		} else if (implementer == 0x41 ){
 			if (prt >= 0xd4b) cpuhiperf++;
 			else if (prt>= 0xd07) cpumidperf++;
 			else cpulowperf++;
 		} else cpulowperf++;
 	}
 	if (!part) part = prt;
 #else	
    snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
    openblas_warning(1, coremsg);
    return NULL;
 #endif
  } else {
    get_cpu_ftr(MIDR_EL1, midr_el1);
  }
  
  /*
   * MIDR_EL1
   *
@@ -297,7 +334,7 @@ static gotoblas_t *get_coretype(void) {
   */
  implementer = (midr_el1 >> 24) & 0xFF;
  part        = (midr_el1 >> 4)  & 0xFFF;

  }
  switch(implementer)
  {
    case 0x41: // ARM
--- a/driver/others/dynamic_loongarch64.c
+++ b/driver/others/dynamic_loongarch64.c
@@ -28,25 +28,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <sys/auxv.h>
 #include "common.h"

 extern gotoblas_t  gotoblas_LOONGSON3R5;
 extern gotoblas_t  gotoblas_LOONGSON2K1000;
 extern gotoblas_t  gotoblas_LOONGSONGENERIC;
 #define NUM_CORETYPES       6
 #define LOONGARCH_CFG0      0x00
 #define LA_HWCAP_LSX        (1U << 4)
 #define LA_HWCAP_LASX       (1U << 5)
 #define PRID_SERIES_MASK    0xf000
 #define PRID_SERIES_LA264   0xa000
 #define PRID_SERIES_LA364   0xb000
 #define PRID_SERIES_LA464   0xc000
 #define PRID_SERIES_LA664   0xd000

 extern gotoblas_t  gotoblas_LA64_GENERIC;
 extern gotoblas_t  gotoblas_LA264;
 extern gotoblas_t  gotoblas_LA464;

 extern void openblas_warning(int verbose, const char * msg);

 #define NUM_CORETYPES    3

 static char *corename[] = {
  "loongson3r5",
  "loongson2k1000",
  "la64_generic",
  "la264",
  "la464",
  "loongsongeneric",
  "loongson2k1000",
  "loongson3r5",
  "unknown"
 };

 char *gotoblas_corename(void) {
  if (gotoblas == &gotoblas_LOONGSON3R5)     return corename[0];
  if (gotoblas == &gotoblas_LOONGSON2K1000)  return corename[1];
  if (gotoblas == &gotoblas_LOONGSONGENERIC) return corename[2];
  if (gotoblas == &gotoblas_LA64_GENERIC) return corename[0];
  if (gotoblas == &gotoblas_LA264)        return corename[1];
  if (gotoblas == &gotoblas_LA464)        return corename[2];
  return corename[NUM_CORETYPES];
 }

@@ -66,27 +77,78 @@ static gotoblas_t *force_coretype(char *coretype) {

  switch (found)
  {
    case  0: return (&gotoblas_LOONGSON3R5);
    case  1: return (&gotoblas_LOONGSON2K1000);
    case  2: return (&gotoblas_LOONGSONGENERIC);
    case  0: return (&gotoblas_LA64_GENERIC);
    case  1: return (&gotoblas_LA264);
    case  2: return (&gotoblas_LA464);
    case  3: return (&gotoblas_LA64_GENERIC);
    case  4: return (&gotoblas_LA264);
    case  5: return (&gotoblas_LA464);
  }
  snprintf(message, 128, "Core not found: %s\n", coretype);
  openblas_warning(1, message);
  return NULL;
 }

 #define LA_HWCAP_LSX    (1U << 4)
 #define LA_HWCAP_LASX   (1U << 5)

 static gotoblas_t *get_coretype(void) {
  int hwcap = (int)getauxval(AT_HWCAP);
 /* Detect whether the OS supports the LASX instruction set */
 static int os_support_lasx() {
  int hwcap  = (int)getauxval(AT_HWCAP);

  if (hwcap & LA_HWCAP_LASX)
    return &gotoblas_LOONGSON3R5;
  else if (hwcap & LA_HWCAP_LSX)
    return &gotoblas_LOONGSON2K1000;
    return 1;
  else
    return 0;
 }

 /* Detect whether the OS supports the LSX instruction set */
 static int os_support_lsx() {
  int hwcap  = (int)getauxval(AT_HWCAP);

  if (hwcap & LA_HWCAP_LSX)
    return 1;
  else
    return &gotoblas_LOONGSONGENERIC;
    return 0;
 }

 static uint32_t get_prid() {
  uint32_t reg = 0;
  __asm__ volatile (
    "cpucfg %0, %1 \n\t"
    : "+&r"(reg)
    : "r"(LOONGARCH_CFG0)
  );
  return reg;
 }

 /* Select core at runtime based on the
 * cpu name and SIMD instructions supported
 * by the system
 */
 static gotoblas_t *get_coretype(void) {
  uint32_t prid = get_prid();
  switch (prid & PRID_SERIES_MASK) {
    case (PRID_SERIES_LA464):
    case (PRID_SERIES_LA664):
      if (os_support_lasx())
        return &gotoblas_LA464;
      else if (os_support_lsx())
        return &gotoblas_LA264;
      else
        return &gotoblas_LA64_GENERIC;
    break;

    case (PRID_SERIES_LA264):
    case (PRID_SERIES_LA364):
      if (os_support_lsx())
        return &gotoblas_LA264;
      else
        return &gotoblas_LA64_GENERIC;
    break;

    default:
      return &gotoblas_LA64_GENERIC;
    break;
  }
 }

 void gotoblas_dynamic_init(void) {
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -2538,7 +2538,7 @@ static void *alloc_shm(void *address){
 }
 #endif

 #if defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS
 #if ((defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS))

 static void alloc_hugetlb_free(struct release_t *release){

@@ -3254,7 +3254,7 @@ void blas_shutdown(void){
 #endif
      newmemory[pos].lock   = 0;
    }
    free(newmemory);
    free((void*)newmemory);
    newmemory = NULL;
    memory_overflowed = 0;  
  }
--- a/driver/others/parameter.c
+++ b/driver/others/parameter.c
@@ -752,7 +752,7 @@ int get_L3_size() {
 }

 void blas_set_parameter(void){
 #if defined(LOONGSON3R5)
 #if defined(LA464)
  int L3_size = get_L3_size();
 #ifdef SMP
  if(blas_num_threads == 1){
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -21,7 +21,7 @@ blasobjsc="
    chbmv chemm chemv cher2 cher2k cher cherk scabs1 scamax
    chpmv chpr2 chpr crotg cscal csrot csscal cswap scamin scasum scnrm2
    csymm csyr2k csyrk ctbmv ctbsv ctpmv ctpsv ctrmm ctrmv ctrsm
    ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum cgemmt"
    ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum cgemmt cgemmtr"

 blasobjsd="
    damax damin dasum daxpy daxpby dcabs1 dcopy ddot dgbmv dgemm
@@ -29,7 +29,7 @@ blasobjsd="
    dscal dsdot dspmv dspr2 dimatcopy domatcopy
    dspr dswap dsymm dsymv dsyr2 dsyr2k dsyr dsyrk dtbmv dtbsv
    dtpmv dtpsv dtrmm dtrmv dtrsm dtrsv
        idamax idamin idmax idmin dgeadd dsum dgemmt"
        idamax idamin idmax idmin dgeadd dsum dgemmt dgemmtr"

 blasobjss="
    isamax isamin ismax ismin
@@ -38,7 +38,7 @@ blasobjss="
    smax smin snrm2 simatcopy somatcopy
    srot srotg srotm srotmg ssbmv sscal sspmv sspr2 sspr sswap
    ssymm ssymv ssyr2 ssyr2k ssyr ssyrk stbmv stbsv stpmv stpsv
    strmm strmv strsm strsv  sgeadd ssum sgemmt"
    strmm strmv strsm strsv  sgeadd ssum sgemmt sgemmtr"

 blasobjsz="
    izamax izamin
@@ -48,17 +48,17 @@ blasobjsz="
    zhpr zrotg zscal zswap zsymm zsyr2k zsyrk ztbmv
    ztbsv ztpmv ztpsv ztrmm ztrmv ztrsm ztrsv
    zomatcopy  zimatcopy dzamax dzamin dzasum dznrm2
    zgeadd  dzsum zgemmt"
    zgeadd  dzsum zgemmt zgemmtr"

 blasobjs="lsame xerbla"
 bfblasobjs="sbgemm sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod"
 bfblasobjs="sbgemm sbgemmt sbgemmtr sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod"
 cblasobjsc="
    cblas_caxpy cblas_ccopy cblas_cdotc cblas_cdotu cblas_cgbmv cblas_cgemm cblas_cgemv
    cblas_cgerc cblas_cgeru cblas_chbmv cblas_chemm cblas_chemv cblas_cher2 cblas_cher2k
    cblas_cher cblas_cherk  cblas_chpmv cblas_chpr2 cblas_chpr cblas_cscal cblas_caxpby
    cblas_csscal cblas_cswap cblas_csymm cblas_csyr2k cblas_csyrk cblas_ctbmv cblas_cgeadd
    cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
    cblas_scnrm2 cblas_scasum cblas_cgemmt
    cblas_scnrm2 cblas_scasum cblas_cgemmt cblas_cgemmtr
    cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
    cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin cblas_cgemm_batch
    "
@@ -68,7 +68,7 @@ cblasobjsd="
    cblas_drot cblas_drotg cblas_drotm cblas_drotmg cblas_dsbmv cblas_dscal cblas_dsdot
    cblas_dspmv cblas_dspr2 cblas_dspr cblas_dswap cblas_dsymm cblas_dsymv cblas_dsyr2
    cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
    cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt
    cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt cblas_dgemmtr
    cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
    cblas_damax  cblas_damin cblas_dgemm_batch
    "
@@ -80,7 +80,7 @@ cblasobjss="
    cblas_srotm cblas_srotmg cblas_ssbmv cblas_sscal cblas_sspmv cblas_sspr2 cblas_sspr
    cblas_sswap cblas_ssymm cblas_ssymv cblas_ssyr2 cblas_ssyr2k cblas_ssyr cblas_ssyrk
    cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
    cblas_strsv cblas_sgeadd cblas_sgemmt
    cblas_strsv cblas_sgeadd cblas_sgemmt cblas_sgemmtr
    cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
    cblas_samax cblas_samin cblas_sgemm_batch
    "
@@ -92,7 +92,7 @@ cblasobjsz="
    cblas_zhpr cblas_zscal cblas_zswap cblas_zsymm cblas_zsyr2k cblas_zsyrk
    cblas_ztbmv cblas_ztbsv cblas_ztpmv cblas_ztpsv cblas_ztrmm cblas_ztrmv cblas_ztrsm
    cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
    cblas_zaxpby cblas_zgeadd cblas_zgemmt
    cblas_zaxpby cblas_zgeadd cblas_zgemmt cblas_zgemmtr
    cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
    cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin cblas_zgemm_batch
 "
@@ -869,8 +869,12 @@ lapackobjs2z="$lapackobjs2z
 #functions added post 3.11

 lapackobjs2c="$lapackobjs2c
    cgelst
    cgeqp3rk
    claqp2rk
    claqp3rk
    clatrs3
    crscl
    ctrsyl3
    "
 #    claqz0
@@ -880,10 +884,8 @@ lapackobjs2c="$lapackobjs2c
 #    clatrs3

 lapackobjs2d="$lapackobjs2d
    dgelqs
    dgelst
    dgeqp3rk
    dgeqrs
    dlaqp2rk
    dlaqp3rk
    dlarmm
@@ -896,11 +898,19 @@ lapackobjs2d="$lapackobjs2d
 #    dlaqz3
 #    dlaqz4

 lapackobjs2s="$lapackobjs2s
    sgelst
    sgeqp3rk
    slaqp2rk
    slaqp3rk
    slarmm
    slatrs3
    strsyl3
    "

 lapackobjs2z="$lapackobjs2z
    zgelqs
    zgelst
    zgeqp3rk
    zgeqrs
    zlaqp2rk
    zlaqp3rk
    zlatrs3
@@ -918,6 +928,7 @@ lapack_extendedprecision_objs="
 "

 lapack_deprecated_objsc="
    cgelqs  cgeqrs
    cgegs   cggsvd
    cgegv   cggsvp
    cgelsx  clahrd
@@ -926,6 +937,7 @@ lapack_deprecated_objsc="
    "

 lapack_deprecated_objsd="
    dgelqs  dgeqrs
    dgegs   dgeqpf
    dgegv   dggsvd
    dgelsx  dggsvp
@@ -933,6 +945,8 @@ lapack_deprecated_objsd="
  dlatzm  dtzrqf"

 lapack_deprecated_objss="
  sgelqs
  sgeqrs
  sgelsx
  sgegs
  sgegv
@@ -945,6 +959,8 @@ lapack_deprecated_objss="
  "

 lapack_deprecated_objsz="
  zgelqs
  zgeqrs
  zgegs
  zgegv
  zgelsx
--- a/getarch.c
+++ b/getarch.c
@@ -135,11 +135,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* #define FORCE_CELL		*/
 /* #define FORCE_MIPS64_GENERIC	*/
 /* #define FORCE_SICORTEX	*/
 /* #define FORCE_LOONGSON3R3     */
 /* #define FORCE_LOONGSON3R4     */
 /* #define FORCE_LOONGSON3R3    */
 /* #define FORCE_LOONGSON3R4    */
 /* #define FORCE_LOONGSON3R5     */
 /* #define FORCE_LOONGSON2K1000  */
 /* #define FORCE_LOONGSONGENERIC */
 /* #define FORCE_LA64_GENERIC   */
 /* #define FORCE_LA264          */
 /* #define FORCE_LA464          */
 /* #define FORCE_I6400		*/
 /* #define FORCE_P6600		*/
 /* #define FORCE_P5600		*/
@@ -153,7 +156,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* #define FORCE_EV5		*/
 /* #define FORCE_EV6		*/
 /* #define FORCE_CSKY		*/
 /* #define FORCE_CK860FV		*/
 /* #define FORCE_CK860FV        */
 /* #define FORCE_GENERIC	*/

 #ifdef FORCE_P2
@@ -979,46 +982,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #endif

 #ifdef FORCE_LOONGSON3R5
 #if defined(FORCE_LA464) || defined(FORCE_LOONGSON3R5)
 #define FORCE
 #define ARCHITECTURE    "LOONGARCH"
 #define SUBARCHITECTURE "LOONGSON3R5"
 #ifdef NO_LASX
 #ifdef NO_LSX
 #define SUBARCHITECTURE "LA64_GENERIC"
 #define SUBDIRNAME      "loongarch64"
 #define ARCHCONFIG   "-DLOONGSON3R5 " \
 #define ARCHCONFIG   "-DLA64_GENERIC " \
       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
       "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \
       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA"
 #define LIBNAME   "loongson3r5"
 #define CORENAME  "LOONGSON3R5"
       "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
       "-DDTB_DEFAULT_ENTRIES=64 "
 #define LIBNAME   "la64_generic"
 #define CORENAME  "LA64_GENERIC"
 #else
 #define SUBARCHITECTURE "LA264"
 #define SUBDIRNAME      "loongarch64"
 #define ARCHCONFIG   "-DLA264 " \
       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
       "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
       "-DDTB_DEFAULT_ENTRIES=64 "
 #define LIBNAME   "la264"
 #define CORENAME  "LA264"
 #endif
 #else
 #define SUBARCHITECTURE "LA464"
 #define SUBDIRNAME      "loongarch64"
 #define ARCHCONFIG   "-DLA464 " \
       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
       "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
       "-DDTB_DEFAULT_ENTRIES=64 "
 #define LIBNAME   "la464"
 #define CORENAME  "LA464"
 #endif
 #endif

 #ifdef FORCE_LOONGSON2K1000
 #if defined(FORCE_LA264) || defined(FORCE_LOONGSON2K1000)
 #define FORCE
 #define ARCHITECTURE    "LOONGARCH"
 #define SUBARCHITECTURE "LOONGSON2K1000"
 #ifdef NO_LSX
 #define SUBARCHITECTURE "LA64_GENERIC"
 #define SUBDIRNAME      "loongarch64"
 #define ARCHCONFIG   "-DLOONGSON2K1000 " \
 #define ARCHCONFIG   "-DLA64_GENERIC " \
       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
       "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA"
 #define LIBNAME   "loongson2k1000"
 #define CORENAME  "LOONGSON2K1000"
       "-DDTB_DEFAULT_ENTRIES=64 "
 #define LIBNAME   "la64_generic"
 #define CORENAME  "LA64_GENERIC"
 #else
 #define SUBARCHITECTURE "LA264"
 #define SUBDIRNAME      "loongarch64"
 #define ARCHCONFIG   "-DLA264 " \
       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
       "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
       "-DDTB_DEFAULT_ENTRIES=64 "
 #define LIBNAME   "la264"
 #define CORENAME  "LA264"
 #endif
 #endif

 #ifdef FORCE_LOONGSONGENERIC
 #if defined(FORCE_LA64_GENERIC) || defined(FORCE_LOONGSONGENERIC)
 #define FORCE
 #define ARCHITECTURE    "LOONGARCH"
 #define SUBARCHITECTURE "LOONGSONGENERIC"
 #define SUBARCHITECTURE "LA64_GENERIC"
 #define SUBDIRNAME      "loongarch64"
 #define ARCHCONFIG   "-DLOONGSONGENERIC " \
 #define ARCHCONFIG   "-DLA64_GENERIC " \
       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
       "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA"
 #define LIBNAME   "loongsongeneric"
 #define CORENAME  "LOONGSONGENERIC"
 #else
       "-DDTB_DEFAULT_ENTRIES=64 "
 #define LIBNAME   "la64_generic"
 #define CORENAME  "LA64_GENERIC"
 #endif

 #ifdef FORCE_I6400
--- a/interface/CMakeLists.txt
+++ b/interface/CMakeLists.txt
@@ -107,6 +107,9 @@ endif ()

  # trmm is trsm with a compiler flag set
  GenerateNamedObjects("trsm.c" "TRMM" "trmm" ${CBLAS_FLAG})
  
  # gemmtr is gemmt under the name adopted by the Reference BLAS
  GenerateNamedObjects("gemm.c" "" "gemmtr" ${CBLAS_FLAG})

  # max and imax are compiled 4 times
  GenerateNamedObjects("max.c" "" "" ${CBLAS_FLAG})
@@ -123,6 +126,7 @@ if (BUILD_BFLOAT16)
 	GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("gemmt.c" "" "sbgemmtr" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -44,12 +44,12 @@ SBLAS3OBJS    = \
 		sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \
 		strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \
 		somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\
 		sgeadd.$(SUFFIX) sgemmt.$(SUFFIX)
 		sgeadd.$(SUFFIX) sgemmt.$(SUFFIX) sgemmtr.$(SUFFIX)

 ifeq ($(BUILD_BFLOAT16),1)
 SBBLAS1OBJS    = sbdot.$(SUFFIX)
 SBBLAS2OBJS    = sbgemv.$(SUFFIX)
 SBBLAS3OBJS    = sbgemm.$(SUFFIX) sbgemmt.$(SUFFIX)
 SBBLAS3OBJS    = sbgemm.$(SUFFIX) sbgemmt.$(SUFFIX) sbgemmtr.$(SUFFIX)
 SBEXTOBJS      = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX)
 endif

@@ -76,7 +76,7 @@ DBLAS3OBJS    = \
 		dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \
 		dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \
 		domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\
 		dgeadd.$(SUFFIX) dgemmt.$(SUFFIX)
 		dgeadd.$(SUFFIX) dgemmt.$(SUFFIX) dgemmtr.$(SUFFIX)

 CBLAS1OBJS    = \
 		caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
@@ -105,7 +105,7 @@ CBLAS3OBJS    = \
 		ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \
 	       	chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \
 		comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\
 		cgeadd.$(SUFFIX) cgemmt.$(SUFFIX)
 		cgeadd.$(SUFFIX) cgemmt.$(SUFFIX) cgemmtr.$(SUFFIX)

 ZBLAS1OBJS    = \
 		zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
@@ -134,7 +134,7 @@ ZBLAS3OBJS    = \
 		ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \
 	       	zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \
 		zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\
 		zgeadd.$(SUFFIX) zgemmt.$(SUFFIX)
 		zgeadd.$(SUFFIX) zgemmt.$(SUFFIX) zgemmtr.$(SUFFIX)

 ifeq ($(SUPPORT_GEMM3M), 1)

@@ -282,12 +282,12 @@ CSBLAS2OBJS   = \
 CSBLAS3OBJS   = \
 	cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \
 	cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX)  cblas_simatcopy.$(SUFFIX)\
 	cblas_sgeadd.$(SUFFIX) cblas_sgemmt.$(SUFFIX) cblas_sgemm_batch.$(SUFFIX)
 	cblas_sgeadd.$(SUFFIX) cblas_sgemmt.$(SUFFIX) cblas_sgemmtr.$(SUFFIX) cblas_sgemm_batch.$(SUFFIX)

 ifeq ($(BUILD_BFLOAT16),1)
 CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX)
 CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX)
 CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX) cblas_sbgemm_batch.$(SUFFIX)
 CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX) cblas_sbgemmtr.$(SUFFIX) cblas_sbgemm_batch.$(SUFFIX)
 CSBEXTOBJS   = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX)
 endif

@@ -308,7 +308,7 @@ CDBLAS2OBJS   = \
 CDBLAS3OBJS   += \
 	cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \
 	cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX)  cblas_dimatcopy.$(SUFFIX) \
        cblas_dgeadd.$(SUFFIX) cblas_dgemmt.$(SUFFIX) cblas_dgemm_batch.$(SUFFIX)
        cblas_dgeadd.$(SUFFIX) cblas_dgemmt.$(SUFFIX) cblas_dgemmtr.$(SUFFIX) cblas_dgemm_batch.$(SUFFIX)

 CCBLAS1OBJS   = \
 	cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX)  cblas_caxpy.$(SUFFIX) \
@@ -333,7 +333,7 @@ CCBLAS3OBJS   = \
 	cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \
 	cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \
 	cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\
 	cblas_cgeadd.$(SUFFIX) cblas_cgemmt.$(SUFFIX) cblas_cgemm_batch.$(SUFFIX)
 	cblas_cgeadd.$(SUFFIX) cblas_cgemmt.$(SUFFIX) cblas_cgemmtr.$(SUFFIX) cblas_cgemm_batch.$(SUFFIX)
 	
 CXERBLAOBJ = \
 	cblas_xerbla.$(SUFFIX)
@@ -364,7 +364,7 @@ CZBLAS3OBJS   = \
 	cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \
 	cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\
 	cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \
 	cblas_zgeadd.$(SUFFIX) cblas_zgemmt.$(SUFFIX) cblas_zgemm_batch.$(SUFFIX)
 	cblas_zgeadd.$(SUFFIX) cblas_zgemmt.$(SUFFIX) cblas_zgemmtr.$(SUFFIX) cblas_zgemm_batch.$(SUFFIX)


 ifeq ($(SUPPORT_GEMM3M), 1)
@@ -1305,6 +1305,8 @@ sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 sbgemmtr.$(SUFFIX) sbgemmtr.$(PSUFFIX) : sbgemmt.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 endif

 sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h
@@ -1340,7 +1342,19 @@ cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h
 zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)

 xgemmt.$(SUFFIX) xgemmt.$(PSUFFIX) : gemmt.c ../param.h
 sgemmtr.$(SUFFIX) sgemmtr.$(PSUFFIX) : gemmt.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)

 dgemmtr.$(SUFFIX) dgemmtr.$(PSUFFIX) : gemmt.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)

 qgemmtr.$(SUFFIX) qgemmtr.$(PSUFFIX) : gemmt.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)

 cgemmtr.$(SUFFIX) cgemmtr.$(PSUFFIX) : gemmt.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)

 zgemmtr.$(SUFFIX) zgemmtr.$(PSUFFIX) : gemmt.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)

 ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c
@@ -1966,9 +1980,14 @@ cblas_zgemm.$(SUFFIX) cblas_zgemm.$(PSUFFIX) : gemm.c ../param.h
 cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)

 cblas_sgemmtr.$(SUFFIX) cblas_sgemmtr.$(PSUFFIX) : gemmt.c ../param.h
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)

 ifeq ($(BUILD_BFLOAT16),1)
 cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
 cblas_sbgemmtr.$(SUFFIX) cblas_sbgemmtr.$(PSUFFIX) : sbgemmt.c ../param.h
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
 endif

 cblas_dgemmt.$(SUFFIX) cblas_dgemmt.$(PSUFFIX) : gemmt.c ../param.h
@@ -1980,6 +1999,15 @@ cblas_cgemmt.$(SUFFIX) cblas_cgemmt.$(PSUFFIX) : gemmt.c ../param.h
 cblas_zgemmt.$(SUFFIX) cblas_zgemmt.$(PSUFFIX) : gemmt.c ../param.h
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)

 cblas_dgemmtr.$(SUFFIX) cblas_dgemmtr.$(PSUFFIX) : gemmt.c ../param.h
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)

 cblas_cgemmtr.$(SUFFIX) cblas_cgemmtr.$(PSUFFIX) : gemmt.c ../param.h
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)

 cblas_zgemmtr.$(SUFFIX) cblas_zgemmtr.$(PSUFFIX) : gemmt.c ../param.h
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)

 cblas_ssymm.$(SUFFIX) cblas_ssymm.$(PSUFFIX) : symm.c
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)

--- a/interface/gemm.c
+++ b/interface/gemm.c
@@ -39,6 +39,7 @@

 #include <stdio.h>
 #include <stdlib.h>
 #include <stdbool.h>
 #include "common.h"
 #ifdef FUNCTION_PROFILE
 #include "functable.h"
@@ -85,7 +86,7 @@
 #endif

 static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, BLASLONG) = {
 #ifndef GEMM3M
 #if !defined(GEMM3M) || defined(GENERIC)
  GEMM_NN, GEMM_TN, GEMM_RN, GEMM_CN,
  GEMM_NT, GEMM_TT, GEMM_RT, GEMM_CT,
  GEMM_NR, GEMM_TR, GEMM_RR, GEMM_CR,
@@ -498,7 +499,16 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
 	 args.m, args.n, args.k, args.lda, args.ldb, args.ldc);
 #endif

 #if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && !defined(BFLOAT16)
 #if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && (!defined(BFLOAT16) || defined(GEMM_GEMV_FORWARD_BF16))
 #if defined(ARCH_ARM64)
  // The gemv kernels in arm64/{gemv_n.S,gemv_n_sve.c,gemv_t.S,gemv_t_sve.c}
  // perform poorly in certain circumstances. We use the following boolean
  // variable along with the gemv argument values to avoid these inefficient
  // gemv cases, see github issue#4951.
  bool have_tuned_gemv = false;
 #else
  bool have_tuned_gemv = true;
 #endif
  // Check if we can convert GEMM -> GEMV
  if (args.k != 0) {
    if (args.n == 1) {
@@ -518,8 +528,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
      if (transb & 1) {
        inc_x = args.ldb;
      }
      GEMV(&NT, &m, &n, args.alpha, args.a, &lda, args.b, &inc_x, args.beta, args.c, &inc_y);
      return;
      bool is_efficient_gemv = have_tuned_gemv || ((NT == 'N') || (NT == 'T' && inc_x == 1));
      if (is_efficient_gemv) {
        GEMV(&NT, &m, &n, args.alpha, args.a, &lda, args.b, &inc_x, args.beta, args.c, &inc_y);
        return;
      }
    }
    if (args.m == 1) {
      blasint inc_x = args.lda;
@@ -538,8 +551,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
        m = args.n;
        n = args.k;
      }
      GEMV(&NT, &m, &n, args.alpha, args.b, &ldb, args.a, &inc_x, args.beta, args.c, &inc_y);
      return;
      bool is_efficient_gemv = have_tuned_gemv || ((NT == 'N' && inc_y == 1) || (NT == 'T' && inc_x == 1));
      if (is_efficient_gemv) {
        GEMV(&NT, &m, &n, args.alpha, args.b, &ldb, args.a, &inc_x, args.beta, args.c, &inc_y);
        return;
      }
    }
  }
 #endif
@@ -572,7 +588,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS

  buffer = (XFLOAT *)blas_memory_alloc(0);

 //For target LOONGSON3R5, applying an offset to the buffer is essential
 //For LOONGARCH64, applying an offset to the buffer is essential
 //for minimizing cache conflicts and optimizing performance.
 #if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY)
  sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A);
--- a/interface/gemmt.c
+++ b/interface/gemmt.c
@@ -319,8 +319,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 		lda = LDB;
 		ldb = LDA;

 		if (Uplo == CblasUpper) uplo = 0;
 		if (Uplo == CblasLower) uplo = 1;
 		if (Uplo == CblasUpper) uplo = 1;
 		if (Uplo == CblasLower) uplo = 0;

 		if (TransB == CblasNoTrans)
 			transa = 0;
--- a/interface/lapack/trtri.c
+++ b/interface/lapack/trtri.c
@@ -127,6 +127,9 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
 #endif

 #ifdef SMP
 if (args.n <= 150)
  args.nthreads = 1;
 else
  args.nthreads = num_cpu_avail(4);

  if (args.nthreads == 1) {
--- a/interface/sbgemv.c
+++ b/interface/sbgemv.c
@@ -178,21 +178,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasi
    if (incy < 0) {y -= (leny - 1) * incy;}

 #ifdef SMP
    int thread_thres_row = 20480;
    if (trans) {
        if (n <= thread_thres_row) {
            nthreads = 1;
        } else {
            nthreads = num_cpu_avail(1);
        }
    } else {
        if (m <= thread_thres_row) {
            nthreads = 1;
        } else {
            nthreads = num_cpu_avail(1);
        }
    }

    if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD )
      nthreads = 1;
    else
      nthreads = num_cpu_avail(2);

    if (nthreads == 1) {
 #endif
--- a/kernel/arm64/KERNEL.A64FX
+++ b/kernel/arm64/KERNEL.A64FX
@@ -2,5 +2,5 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE

 SGEMVNKERNEL = gemv_n_sve.c
 DGEMVNKERNEL = gemv_n_sve.c
 SGEMVTKERNEL = gemv_t_sve.c
 DGEMVTKERNEL = gemv_t_sve.c
 SGEMVTKERNEL = gemv_t_sve_v4x3.c
 DGEMVTKERNEL = gemv_t_sve_v4x3.c
--- a/kernel/arm64/KERNEL.ARMV8SVE
+++ b/kernel/arm64/KERNEL.ARMV8SVE
@@ -64,8 +64,8 @@ DAXPYKERNEL  = daxpy_thunderx2t99.S
 CAXPYKERNEL  = zaxpy.S
 ZAXPYKERNEL  = zaxpy.S

 SROTKERNEL   = rot.S
 DROTKERNEL   = rot.S
 SROTKERNEL   = rot.c
 DROTKERNEL   = rot.c
 CROTKERNEL   = zrot.S
 ZROTKERNEL   = zrot.S

@@ -94,8 +94,8 @@ DCOPYKERNEL    = copy_thunderx2t99.c
 CCOPYKERNEL    = copy_thunderx2t99.c
 ZCOPYKERNEL    = copy_thunderx2t99.c

 SSWAPKERNEL    = swap_thunderx2t99.S
 DSWAPKERNEL    = swap_thunderx2t99.S
 SSWAPKERNEL    = swap.c
 DSWAPKERNEL    = swap.c
 CSWAPKERNEL    = swap_thunderx2t99.S
 ZSWAPKERNEL    = swap_thunderx2t99.S

@@ -104,10 +104,10 @@ IDAMAXKERNEL   = iamax_thunderx2t99.c
 ICAMAXKERNEL   = izamax_thunderx2t99.c
 IZAMAXKERNEL   = izamax_thunderx2t99.c

 SNRM2KERNEL    = scnrm2_thunderx2t99.c
 DNRM2KERNEL    = dznrm2_thunderx2t99.c
 CNRM2KERNEL    = scnrm2_thunderx2t99.c
 ZNRM2KERNEL    = dznrm2_thunderx2t99.c
 SNRM2KERNEL    = nrm2.S
 DNRM2KERNEL    = nrm2.S
 CNRM2KERNEL    = znrm2.S
 ZNRM2KERNEL    = znrm2.S

 DDOTKERNEL     = dot.c
 SDOTKERNEL     = dot.c
--- a/kernel/arm64/KERNEL.NEOVERSEN2
+++ b/kernel/arm64/KERNEL.NEOVERSEN2
@@ -91,10 +91,10 @@ IDAMAXKERNEL   = iamax_thunderx2t99.c
 ICAMAXKERNEL   = izamax_thunderx2t99.c
 IZAMAXKERNEL   = izamax_thunderx2t99.c

 SNRM2KERNEL    = scnrm2_thunderx2t99.c
 DNRM2KERNEL    = dznrm2_thunderx2t99.c
 CNRM2KERNEL    = scnrm2_thunderx2t99.c
 ZNRM2KERNEL    = dznrm2_thunderx2t99.c
 SNRM2KERNEL    = nrm2.S
 DNRM2KERNEL    = nrm2.S
 CNRM2KERNEL    = znrm2.S
 ZNRM2KERNEL    = znrm2.S

 DDOTKERNEL     = dot.c
 SDOTKERNEL     = dot.c
--- a/kernel/arm64/KERNEL.NEOVERSEV1
+++ b/kernel/arm64/KERNEL.NEOVERSEV1
@@ -1,4 +1,4 @@
 include $(KERNELDIR)/KERNEL.ARMV8SVE

 SGEMVTKERNEL = gemv_t_sve.c
 DGEMVTKERNEL = gemv_t_sve.c
 SGEMVTKERNEL = gemv_t_sve_v1x3.c
 DGEMVTKERNEL = gemv_t_sve_v1x3.c
--- a/kernel/arm64/dgemm_small_kernel_tn_sve.c
+++ b/kernel/arm64/dgemm_small_kernel_tn_sve.c
@@ -211,6 +211,7 @@ CNAME(BLASLONG M,
  const BLASLONG v_m1 = M & -v_size;
  const BLASLONG n4 = N & -4;
  const BLASLONG n2 = N & -2;
  const BLASLONG n8 = N & -8;

  const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0;
  FLOAT* packed_a =
@@ -229,28 +230,37 @@ CNAME(BLASLONG M,
    CREATE_A_POINTER(1, v_size);

    BLASLONG j = 0;
    for (; j < n4; j += 4) {

    for (; j < n8; j += 8) {
      CREATE_B_POINTER(0, 0);
      CREATE_B_POINTER(1, 1);
      CREATE_B_POINTER(2, 2);
      CREATE_B_POINTER(3, 3);
      UPDATE_B_POINTER(4);
      CREATE_B_POINTER(4, 4);
      CREATE_B_POINTER(5, 5);
      CREATE_B_POINTER(6, 6);
      CREATE_B_POINTER(7, 7);
      UPDATE_B_POINTER(8);

      BLASLONG k = 0;
      DECLARE_RESULT_VECTOR(0, 0);
      DECLARE_RESULT_VECTOR(0, 1);
      DECLARE_RESULT_VECTOR(0, 2);
      DECLARE_RESULT_VECTOR(0, 3);
      DECLARE_RESULT_VECTOR(0, 4);
      DECLARE_RESULT_VECTOR(0, 5);
      DECLARE_RESULT_VECTOR(0, 6);
      DECLARE_RESULT_VECTOR(0, 7);
      DECLARE_RESULT_VECTOR(1, 0);
      DECLARE_RESULT_VECTOR(1, 1);
      DECLARE_RESULT_VECTOR(1, 2);
      DECLARE_RESULT_VECTOR(1, 3);

      DECLARE_RESULT_VECTOR(1, 4);
      DECLARE_RESULT_VECTOR(1, 5);
      DECLARE_RESULT_VECTOR(1, 6);
      DECLARE_RESULT_VECTOR(1, 7);
      if (LIKELY(packed_a != NULL)) {
        if (j == 0) {
          for (; k < K; k++) {

            BROADCAST_LOAD_B(0, 0);
            GATHER_LOAD_A(pg_true, 0, 0);
            VECTOR_PACK_A(0, 0);
@@ -267,10 +277,21 @@ CNAME(BLASLONG M,
            BROADCAST_LOAD_B(3, 0);
            UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0);
            UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0);
            BROADCAST_LOAD_B(4, 0);
            UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0);
            UPDATE_RESULT_VECTOR(pg_true, 1, 4, 0);
            BROADCAST_LOAD_B(5, 0);
            UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0);
            UPDATE_RESULT_VECTOR(pg_true, 1, 5, 0);
            BROADCAST_LOAD_B(6, 0);
            UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0);
            UPDATE_RESULT_VECTOR(pg_true, 1, 6, 0);
            BROADCAST_LOAD_B(7, 0);
            UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0);
            UPDATE_RESULT_VECTOR(pg_true, 1, 7, 0);
          }
        } else {
          for (; k < K; k++) {

            BROADCAST_LOAD_B(0, 0);
            UNPACK_VECTOR_A(0, 0);
            UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
@@ -285,7 +306,104 @@ CNAME(BLASLONG M,
            BROADCAST_LOAD_B(3, 0);
            UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0);
            UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0);
            BROADCAST_LOAD_B(4, 0);
            UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0);
            UPDATE_RESULT_VECTOR(pg_true, 1, 4, 0);
            BROADCAST_LOAD_B(5, 0);
            UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0);
            UPDATE_RESULT_VECTOR(pg_true, 1, 5, 0);
            BROADCAST_LOAD_B(6, 0);
            UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0);
            UPDATE_RESULT_VECTOR(pg_true, 1, 6, 0);
            BROADCAST_LOAD_B(7, 0);
            UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0);
            UPDATE_RESULT_VECTOR(pg_true, 1, 7, 0);
          }
       }
      } else {
      for (; k < K; k++) {
          BROADCAST_LOAD_B(0, 0);
          GATHER_LOAD_A(pg_true, 0, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
          BROADCAST_LOAD_B(1, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0);
          GATHER_LOAD_A(pg_true, 1, 0);
          UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0);
          UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0);
          BROADCAST_LOAD_B(2, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0);
          UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0);
          BROADCAST_LOAD_B(3, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0);
          UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0);
          BROADCAST_LOAD_B(4, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0);
          UPDATE_RESULT_VECTOR(pg_true, 1, 4, 0);
          BROADCAST_LOAD_B(5, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0);
          UPDATE_RESULT_VECTOR(pg_true, 1, 5, 0);
          BROADCAST_LOAD_B(6, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0);
          UPDATE_RESULT_VECTOR(pg_true, 1, 6, 0);
          BROADCAST_LOAD_B(7, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0);
          UPDATE_RESULT_VECTOR(pg_true, 1, 7, 0);
        }
      }
      VECTOR_STORE(pg_true, 0, 0);
      VECTOR_STORE(pg_true, 0, 1);
      VECTOR_STORE(pg_true, 0, 2);
      VECTOR_STORE(pg_true, 0, 3);
      VECTOR_STORE(pg_true, 0, 4);
      VECTOR_STORE(pg_true, 0, 5);
      VECTOR_STORE(pg_true, 0, 6);
      VECTOR_STORE(pg_true, 0, 7);
      VECTOR_STORE(pg_true, 1, 0);
      VECTOR_STORE(pg_true, 1, 1);
      VECTOR_STORE(pg_true, 1, 2);
      VECTOR_STORE(pg_true, 1, 3);
      VECTOR_STORE(pg_true, 1, 4);
      VECTOR_STORE(pg_true, 1, 5);
      VECTOR_STORE(pg_true, 1, 6);
      VECTOR_STORE(pg_true, 1, 7);
      INCR_C_POINTER(0, 8);
      INCR_C_POINTER(1, 8);
    }
    for (; j < n4; j += 4) {

      CREATE_B_POINTER(0, 0);
      CREATE_B_POINTER(1, 1);
      CREATE_B_POINTER(2, 2);
      CREATE_B_POINTER(3, 3);
      UPDATE_B_POINTER(4);

      BLASLONG k = 0;
      DECLARE_RESULT_VECTOR(0, 0);
      DECLARE_RESULT_VECTOR(0, 1);
      DECLARE_RESULT_VECTOR(0, 2);
      DECLARE_RESULT_VECTOR(0, 3);
      DECLARE_RESULT_VECTOR(1, 0);
      DECLARE_RESULT_VECTOR(1, 1);
      DECLARE_RESULT_VECTOR(1, 2);
      DECLARE_RESULT_VECTOR(1, 3);

      if (LIKELY(packed_a != NULL)) {
        for (; k < K; k++) {

          BROADCAST_LOAD_B(0, 0);
          UNPACK_VECTOR_A(0, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
          BROADCAST_LOAD_B(1, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0);
          UNPACK_VECTOR_A(1, 0);
          UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0);
          UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0);
          BROADCAST_LOAD_B(2, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0);
          UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0);
          BROADCAST_LOAD_B(3, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0);
          UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0);
        }
      } else {
        for (; k < K; k++) {
@@ -405,6 +523,55 @@ CNAME(BLASLONG M,
    CREATE_A_POINTER(0, 0);

    BLASLONG j = 0;
       for (; j < n8; j += 8) {
      CREATE_B_POINTER(0, 0);
      CREATE_B_POINTER(1, 1);
      CREATE_B_POINTER(2, 2);
      CREATE_B_POINTER(3, 3);
      CREATE_B_POINTER(4, 4);
      CREATE_B_POINTER(5, 5);
      CREATE_B_POINTER(6, 6);
      CREATE_B_POINTER(7, 7);
      UPDATE_B_POINTER(8);

      BLASLONG k = 0;
      DECLARE_RESULT_VECTOR(0, 0);
      DECLARE_RESULT_VECTOR(0, 1);
      DECLARE_RESULT_VECTOR(0, 2);
      DECLARE_RESULT_VECTOR(0, 3);
      DECLARE_RESULT_VECTOR(0, 4);
      DECLARE_RESULT_VECTOR(0, 5);
      DECLARE_RESULT_VECTOR(0, 6);
      DECLARE_RESULT_VECTOR(0, 7);         
      for (; k < K; k++) {
          BROADCAST_LOAD_B(0, 0);
          GATHER_LOAD_A(pg_true, 0, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0);
          BROADCAST_LOAD_B(1, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0);
          BROADCAST_LOAD_B(2, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0);
          BROADCAST_LOAD_B(3, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0);
          BROADCAST_LOAD_B(4, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0);
          BROADCAST_LOAD_B(5, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0);
          BROADCAST_LOAD_B(6, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0);
          BROADCAST_LOAD_B(7, 0);
          UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0);
        }
      VECTOR_STORE(pg_true, 0, 0);
      VECTOR_STORE(pg_true, 0, 1);
      VECTOR_STORE(pg_true, 0, 2);
      VECTOR_STORE(pg_true, 0, 3);
      VECTOR_STORE(pg_true, 0, 4);
      VECTOR_STORE(pg_true, 0, 5);
      VECTOR_STORE(pg_true, 0, 6);
      VECTOR_STORE(pg_true, 0, 7);
      INCR_C_POINTER(0, 8);
    }
    for (; j < n4; j += 4) {

      CREATE_B_POINTER(0, 0);
@@ -487,6 +654,55 @@ CNAME(BLASLONG M,
    CREATE_A_POINTER(0, 0);

    BLASLONG j = 0;
    for (; j < n8; j += 8) {
      CREATE_B_POINTER(0, 0);
      CREATE_B_POINTER(1, 1);
      CREATE_B_POINTER(2, 2);
      CREATE_B_POINTER(3, 3);
      CREATE_B_POINTER(4, 4);
      CREATE_B_POINTER(5, 5);
      CREATE_B_POINTER(6, 6);
      CREATE_B_POINTER(7, 7);
      UPDATE_B_POINTER(8);

      BLASLONG k = 0;
      DECLARE_RESULT_VECTOR(0, 0);
      DECLARE_RESULT_VECTOR(0, 1);
      DECLARE_RESULT_VECTOR(0, 2);
      DECLARE_RESULT_VECTOR(0, 3);
      DECLARE_RESULT_VECTOR(0, 4);
      DECLARE_RESULT_VECTOR(0, 5);
      DECLARE_RESULT_VECTOR(0, 6);
      DECLARE_RESULT_VECTOR(0, 7);         
      for (; k < K; k++) {
          BROADCAST_LOAD_B(0, 0);
          GATHER_LOAD_A(pg_tail, 0, 0);
          UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0);
          BROADCAST_LOAD_B(1, 0);
          UPDATE_RESULT_VECTOR(pg_tail, 0, 1, 0);
          BROADCAST_LOAD_B(2, 0);
          UPDATE_RESULT_VECTOR(pg_tail, 0, 2, 0);
          BROADCAST_LOAD_B(3, 0);
          UPDATE_RESULT_VECTOR(pg_tail, 0, 3, 0);
          BROADCAST_LOAD_B(4, 0);
          UPDATE_RESULT_VECTOR(pg_tail, 0, 4, 0);
          BROADCAST_LOAD_B(5, 0);
          UPDATE_RESULT_VECTOR(pg_tail, 0, 5, 0);
          BROADCAST_LOAD_B(6, 0);
          UPDATE_RESULT_VECTOR(pg_tail, 0, 6, 0);
          BROADCAST_LOAD_B(7, 0);
          UPDATE_RESULT_VECTOR(pg_tail, 0, 7, 0);
        }
      VECTOR_STORE(pg_tail, 0, 0);
      VECTOR_STORE(pg_tail, 0, 1);
      VECTOR_STORE(pg_tail, 0, 2);
      VECTOR_STORE(pg_tail, 0, 3);
      VECTOR_STORE(pg_tail, 0, 4);
      VECTOR_STORE(pg_tail, 0, 5);
      VECTOR_STORE(pg_tail, 0, 6);
      VECTOR_STORE(pg_tail, 0, 7);
      INCR_C_POINTER(0, 8);
    }
    for (; j < n4; j += 4) {

      CREATE_B_POINTER(0, 0);
--- a/kernel/arm64/gemv_t_sve_v1x3.c
+++ b/kernel/arm64/gemv_t_sve_v1x3.c
@@ -0,0 +1,152 @@
 /***************************************************************************
 Copyright (c) 2024, The OpenBLAS Project
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:

   1. Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

   2. Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
   3. Neither the name of the OpenBLAS project nor the names of
      its contributors may be used to endorse or promote products
      derived from this software without specific prior written
      permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

 #include <arm_sve.h>

 #include "common.h"

 #ifdef DOUBLE
 #define SV_COUNT svcntd
 #define SV_TYPE svfloat64_t
 #define SV_TRUE svptrue_b64
 #define SV_WHILE svwhilelt_b64_s64
 #define SV_DUP svdup_f64
 #else
 #define SV_COUNT svcntw
 #define SV_TYPE svfloat32_t
 #define SV_TRUE svptrue_b32
 #define SV_WHILE svwhilelt_b32_s64
 #define SV_DUP svdup_f32
 #endif

 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
          FLOAT *buffer)
 {
  BLASLONG i;
  BLASLONG ix,iy;
  BLASLONG j;
  FLOAT *a_ptr;
  FLOAT temp;

  iy = 0;

  if (inc_x == 1) {
    BLASLONG width = (n + 3 - 1) / 3;

    FLOAT *a0_ptr = a + lda * width * 0;
    FLOAT *a1_ptr = a + lda * width * 1;
    FLOAT *a2_ptr = a + lda * width * 2;

    FLOAT *y0_ptr = y + inc_y * width * 0;
    FLOAT *y1_ptr = y + inc_y * width * 1;
    FLOAT *y2_ptr = y + inc_y * width * 2;

    for (j = 0; j < width; j++) {
      svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
      svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
      svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();

      SV_TYPE temp00_vec = SV_DUP(0.0);
      SV_TYPE temp01_vec = SV_DUP(0.0);
      SV_TYPE temp02_vec = SV_DUP(0.0);

      i = 0;
      BLASLONG sve_size = SV_COUNT();
      while ((i + sve_size * 1 - 1) < m) {
        SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x + i, 0);

        SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
        SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
        SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);

        temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec);
        temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec);
        temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec);

        i += sve_size * 1;
      }

      if (i < m) {
        svbool_t pg0 = SV_WHILE(i + sve_size * 0, m);

        pg00 = svand_z(SV_TRUE(), pg0, pg00);
        pg01 = svand_z(SV_TRUE(), pg0, pg01);
        pg02 = svand_z(SV_TRUE(), pg0, pg02);

        SV_TYPE x0_vec = svld1_vnum(pg0, x + i, 0);

        SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
        SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
        SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);

        temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec);
        temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec);
        temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec);
      }

      if ((j + width * 0) < n) {
        temp = svaddv(SV_TRUE(), temp00_vec);
        y0_ptr[iy] += alpha * temp;
      }
      if ((j + width * 1) < n) {
        temp = svaddv(SV_TRUE(), temp01_vec);
        y1_ptr[iy] += alpha * temp;
      }
      if ((j + width * 2) < n) {
        temp = svaddv(SV_TRUE(), temp02_vec);
        y2_ptr[iy] += alpha * temp;
      }
      iy += inc_y;

      a0_ptr += lda;
      a1_ptr += lda;
      a2_ptr += lda;
    }

    return(0);
  }

  a_ptr = a;
  for (j = 0; j < n; j++) {
    temp = 0.0;
    ix = 0;
    for (i = 0; i < m; i++) {
      temp += a_ptr[i] * x[ix];
      ix += inc_x;
    }
    y[iy] += alpha * temp;
    iy += inc_y;
    a_ptr += lda;
  }
  return(0);
 }
--- a/kernel/arm64/gemv_t_sve_v4x3.c
+++ b/kernel/arm64/gemv_t_sve_v4x3.c
@@ -0,0 +1,234 @@
 /***************************************************************************
 Copyright (c) 2024, The OpenBLAS Project
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:

   1. Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

   2. Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
   3. Neither the name of the OpenBLAS project nor the names of
      its contributors may be used to endorse or promote products
      derived from this software without specific prior written
      permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

 #include <arm_sve.h>

 #include "common.h"

 #ifdef DOUBLE
 #define SV_COUNT svcntd
 #define SV_TYPE svfloat64_t
 #define SV_TRUE svptrue_b64
 #define SV_WHILE svwhilelt_b64_s64
 #define SV_DUP svdup_f64
 #else
 #define SV_COUNT svcntw
 #define SV_TYPE svfloat32_t
 #define SV_TRUE svptrue_b32
 #define SV_WHILE svwhilelt_b32_s64
 #define SV_DUP svdup_f32
 #endif

 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
          FLOAT *buffer)
 {
  BLASLONG i;
  BLASLONG ix,iy;
  BLASLONG j;
  FLOAT *a_ptr;
  FLOAT temp;

  iy = 0;

  if (inc_x == 1) {
    BLASLONG width = (n + 3 - 1) / 3;

    FLOAT *a0_ptr = a + lda * width * 0;
    FLOAT *a1_ptr = a + lda * width * 1;
    FLOAT *a2_ptr = a + lda * width * 2;

    FLOAT *y0_ptr = y + inc_y * width * 0;
    FLOAT *y1_ptr = y + inc_y * width * 1;
    FLOAT *y2_ptr = y + inc_y * width * 2;

    for (j = 0; j < width; j++) {
      svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
      svbool_t pg10 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
      svbool_t pg20 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
      svbool_t pg30 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
      svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
      svbool_t pg11 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
      svbool_t pg21 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
      svbool_t pg31 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
      svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();
      svbool_t pg12 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();
      svbool_t pg22 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();
      svbool_t pg32 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();

      SV_TYPE temp00_vec = SV_DUP(0.0);
      SV_TYPE temp10_vec = SV_DUP(0.0);
      SV_TYPE temp20_vec = SV_DUP(0.0);
      SV_TYPE temp30_vec = SV_DUP(0.0);
      SV_TYPE temp01_vec = SV_DUP(0.0);
      SV_TYPE temp11_vec = SV_DUP(0.0);
      SV_TYPE temp21_vec = SV_DUP(0.0);
      SV_TYPE temp31_vec = SV_DUP(0.0);
      SV_TYPE temp02_vec = SV_DUP(0.0);
      SV_TYPE temp12_vec = SV_DUP(0.0);
      SV_TYPE temp22_vec = SV_DUP(0.0);
      SV_TYPE temp32_vec = SV_DUP(0.0);

      i = 0;
      BLASLONG sve_size = SV_COUNT();
      while ((i + sve_size * 4 - 1) < m) {
        SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x + i, 0);
        SV_TYPE x1_vec = svld1_vnum(SV_TRUE(), x + i, 1);
        SV_TYPE x2_vec = svld1_vnum(SV_TRUE(), x + i, 2);
        SV_TYPE x3_vec = svld1_vnum(SV_TRUE(), x + i, 3);

        SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
        SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1);
        SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2);
        SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3);
        SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
        SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1);
        SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2);
        SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3);
        SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);
        SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1);
        SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2);
        SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3);

        temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec);
        temp10_vec = svmla_m(pg10, temp10_vec, a10_vec, x1_vec);
        temp20_vec = svmla_m(pg20, temp20_vec, a20_vec, x2_vec);
        temp30_vec = svmla_m(pg30, temp30_vec, a30_vec, x3_vec);
        temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec);
        temp11_vec = svmla_m(pg11, temp11_vec, a11_vec, x1_vec);
        temp21_vec = svmla_m(pg21, temp21_vec, a21_vec, x2_vec);
        temp31_vec = svmla_m(pg31, temp31_vec, a31_vec, x3_vec);
        temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec);
        temp12_vec = svmla_m(pg12, temp12_vec, a12_vec, x1_vec);
        temp22_vec = svmla_m(pg22, temp22_vec, a22_vec, x2_vec);
        temp32_vec = svmla_m(pg32, temp32_vec, a32_vec, x3_vec);

        i += sve_size * 4;
      }

      if (i < m) {
        svbool_t pg0 = SV_WHILE(i + sve_size * 0, m);
        svbool_t pg1 = SV_WHILE(i + sve_size * 1, m);
        svbool_t pg2 = SV_WHILE(i + sve_size * 2, m);
        svbool_t pg3 = SV_WHILE(i + sve_size * 3, m);

        pg00 = svand_z(SV_TRUE(), pg0, pg00);
        pg10 = svand_z(SV_TRUE(), pg1, pg10);
        pg20 = svand_z(SV_TRUE(), pg2, pg20);
        pg30 = svand_z(SV_TRUE(), pg3, pg30);
        pg01 = svand_z(SV_TRUE(), pg0, pg01);
        pg11 = svand_z(SV_TRUE(), pg1, pg11);
        pg21 = svand_z(SV_TRUE(), pg2, pg21);
        pg31 = svand_z(SV_TRUE(), pg3, pg31);
        pg02 = svand_z(SV_TRUE(), pg0, pg02);
        pg12 = svand_z(SV_TRUE(), pg1, pg12);
        pg22 = svand_z(SV_TRUE(), pg2, pg22);
        pg32 = svand_z(SV_TRUE(), pg3, pg32);

        SV_TYPE x0_vec = svld1_vnum(pg0, x + i, 0);
        SV_TYPE x1_vec = svld1_vnum(pg1, x + i, 1);
        SV_TYPE x2_vec = svld1_vnum(pg2, x + i, 2);
        SV_TYPE x3_vec = svld1_vnum(pg3, x + i, 3);

        SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
        SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1);
        SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2);
        SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3);
        SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
        SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1);
        SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2);
        SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3);
        SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);
        SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1);
        SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2);
        SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3);

        temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec);
        temp10_vec = svmla_m(pg10, temp10_vec, a10_vec, x1_vec);
        temp20_vec = svmla_m(pg20, temp20_vec, a20_vec, x2_vec);
        temp30_vec = svmla_m(pg30, temp30_vec, a30_vec, x3_vec);
        temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec);
        temp11_vec = svmla_m(pg11, temp11_vec, a11_vec, x1_vec);
        temp21_vec = svmla_m(pg21, temp21_vec, a21_vec, x2_vec);
        temp31_vec = svmla_m(pg31, temp31_vec, a31_vec, x3_vec);
        temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec);
        temp12_vec = svmla_m(pg12, temp12_vec, a12_vec, x1_vec);
        temp22_vec = svmla_m(pg22, temp22_vec, a22_vec, x2_vec);
        temp32_vec = svmla_m(pg32, temp32_vec, a32_vec, x3_vec);
      }

      temp00_vec = svadd_x(SV_TRUE(), temp00_vec, temp10_vec);
      temp01_vec = svadd_x(SV_TRUE(), temp01_vec, temp11_vec);
      temp02_vec = svadd_x(SV_TRUE(), temp02_vec, temp12_vec);
      temp20_vec = svadd_x(SV_TRUE(), temp20_vec, temp30_vec);
      temp21_vec = svadd_x(SV_TRUE(), temp21_vec, temp31_vec);
      temp22_vec = svadd_x(SV_TRUE(), temp22_vec, temp32_vec);
      temp00_vec = svadd_x(SV_TRUE(), temp00_vec, temp20_vec);
      temp01_vec = svadd_x(SV_TRUE(), temp01_vec, temp21_vec);
      temp02_vec = svadd_x(SV_TRUE(), temp02_vec, temp22_vec);

      if ((j + width * 0) < n) {
        temp = svaddv(SV_TRUE(), temp00_vec);
        y0_ptr[iy] += alpha * temp;
      }
      if ((j + width * 1) < n) {
        temp = svaddv(SV_TRUE(), temp01_vec);
        y1_ptr[iy] += alpha * temp;
      }
      if ((j + width * 2) < n) {
        temp = svaddv(SV_TRUE(), temp02_vec);
        y2_ptr[iy] += alpha * temp;
      }
      iy += inc_y;

      a0_ptr += lda;
      a1_ptr += lda;
      a2_ptr += lda;
    }

    return(0);
  }

  a_ptr = a;
  for (j = 0; j < n; j++) {
    temp = 0.0;
    ix = 0;
    for (i = 0; i < m; i++) {
      temp += a_ptr[i] * x[ix];
      ix += inc_x;
    }
    y[iy] += alpha * temp;
    iy += inc_y;
    a_ptr += lda;
  }
  return(0);
 }
--- a/kernel/arm64/rot.c
+++ b/kernel/arm64/rot.c
@@ -0,0 +1,40 @@
 /*******************************************************************************
 Copyright (c) 2015, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *******************************************************************************/
 #include "common.h"
 #include "rot_kernel_sve.c"
 #include "rot_kernel_c.c"

 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
 {
        if (n <= 0)
                return (0);
        if (inc_x == 1 && inc_y == 1)
                rot_kernel_sve(n, x, y, c, s);
        else
                rot_kernel_c(n, x, inc_x, y, inc_y, c, s);
        return (0);
 }
--- a/kernel/arm64/rot_kernel_c.c
+++ b/kernel/arm64/rot_kernel_c.c
@@ -0,0 +1,44 @@
 /*******************************************************************************
 Copyright (c) 2015, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *******************************************************************************/
 #include "common.h"

 static int rot_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
 {
        BLASLONG i = 0;
        BLASLONG ix = 0, iy = 0;
        FLOAT temp;
        while (i < n)
        {
                temp = c * x[ix] + s * y[iy];
                y[iy] = c * y[iy] - s * x[ix];
                x[ix] = temp;
                ix += inc_x;
                iy += inc_y;
                i++;
        }
        return (0);
 }
--- a/kernel/arm64/rot_kernel_sve.c
+++ b/kernel/arm64/rot_kernel_sve.c
@@ -0,0 +1,59 @@
 /*******************************************************************************
 Copyright (c) 2015, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *******************************************************************************/
 #include "common.h"
 #include <arm_sve.h>

 #ifdef DOUBLE
 #define SVE_TYPE svfloat64_t
 #define SVE_ZERO svdup_f64(0.0)
 #define SVE_WHILELT svwhilelt_b64
 #define SVE_ALL svptrue_b64()
 #define SVE_WIDTH svcntd()
 #else
 #define SVE_TYPE svfloat32_t
 #define SVE_ZERO svdup_f32(0.0)
 #define SVE_WHILELT svwhilelt_b32
 #define SVE_ALL svptrue_b32()
 #define SVE_WIDTH svcntw()
 #endif

 static int rot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
 {
       for (BLASLONG i = 0; i < n; i += SVE_WIDTH)
              {
              svbool_t pg = SVE_WHILELT((uint64_t)i, (uint64_t)n);
              SVE_TYPE x_vec = svld1(pg, &x[i]);
              SVE_TYPE y_vec = svld1(pg, &y[i]);
              SVE_TYPE cx_vec = svmul_z(pg, x_vec, c);
              SVE_TYPE sy_vec = svmul_z(pg, y_vec, s);
              SVE_TYPE sx_vec = svmul_z(pg, x_vec, s);
              SVE_TYPE cy_vec = svmul_z(pg, y_vec, c);
              svst1(pg, &x[i], svadd_z(pg, cx_vec, sy_vec));
              svst1(pg, &y[i], svsub_z(pg, cy_vec, sx_vec));
       }
       return (0);
 }
--- a/kernel/arm64/swap.c
+++ b/kernel/arm64/swap.c
@@ -0,0 +1,40 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 #include "common.h"
 #include "swap_kernel_sve.c"
 #include "swap_kernel_c.c"

 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
 	if (n <= 0)
 		return 0;
 	if (inc_x == 1 && inc_y == 1)
 		swap_kernel_sve(n, x, y);
 	else
 		swap_kernel_c(n, x, inc_x, y, inc_y);
 	return (0);
 }
--- a/kernel/arm64/swap_kernel_c.c
+++ b/kernel/arm64/swap_kernel_c.c
@@ -0,0 +1,46 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 #include "common.h"
 #include <stdio.h>

 static int swap_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
        BLASLONG i = 0;
        BLASLONG ix = 0, iy = 0;
        FLOAT temp;

        while (i < n)
        {
                temp = x[ix];
                x[ix] = y[iy];
                y[iy] = temp;
                ix += inc_x;
                iy += inc_y;
                i++;
        }
        return (0);
 }
--- a/kernel/arm64/swap_kernel_sve.c
+++ b/kernel/arm64/swap_kernel_sve.c
@@ -0,0 +1,62 @@
 /*******************************************************************************
 Copyright (c) 2015, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *******************************************************************************/
 #include "common.h"
 #include <arm_sve.h>

 #ifdef DOUBLE
 #define SVE_TYPE svfloat64_t
 #define SVE_ZERO svdup_f64(0.0)
 #define SVE_WHILELT svwhilelt_b64
 #define SVE_ALL svptrue_b64()
 #define SVE_WIDTH svcntd()
 #else
 #define SVE_TYPE svfloat32_t
 #define SVE_ZERO svdup_f32(0.0)
 #define SVE_WHILELT svwhilelt_b32
 #define SVE_ALL svptrue_b32()
 #define SVE_WIDTH svcntw()
 #endif

 static int swap_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y)
 {
        BLASLONG sve_width = SVE_WIDTH;

        for (BLASLONG i = 0; i < n; i += sve_width * 2)
        {
                svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n);
                svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n);
                SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
                SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
                SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]);
                SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]);
                svst1(pg_a, &x[i], y_vec_a);
                svst1(pg_a, &y[i], x_vec_a);
                svst1(pg_b, &x[i + sve_width], y_vec_b);
                svst1(pg_b, &y[i + sve_width], x_vec_b);
        }
        return (0);
 }
--- a/kernel/generic/zgemm_beta.c
+++ b/kernel/generic/zgemm_beta.c
@@ -58,8 +58,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
  c_offset = c;

  if (beta_r == 0. && beta_i == 0.) {
    j = n;
    do {
    
    for (j=n;j>0;j--) {
      c_offset1 = c_offset;
      c_offset += ldc;

@@ -88,13 +88,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
 	  i--;
 	} while (i > 0);
      }
      j --;
    } while (j > 0);
    }

  } else {

    j = n;
    do {

    for (j=n;j>0;j--) {
      c_offset1 = c_offset;
      c_offset += ldc;

@@ -151,8 +150,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
 	  i --;
 	} while (i > 0);
      }
      j --;
    } while (j > 0);
    }
  }
  return 0;
 }
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000