diff --git a/.drone.yml b/.drone.yml index b1c211d14..38ded2015 100644 --- a/.drone.yml +++ b/.drone.yml @@ -190,3 +190,27 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS - make -C cpp_thread_test dgemm_tester +--- +kind: pipeline +name: arm64_gcc10 + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:20.04 + environment: + CC: gcc-10 + FC: gfortran-10 + COMMON_FLAGS: 'TARGET=ARMV8 DYNAMIC_ARCH=1' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran-10 perl python g++ + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + - make -C test $COMMON_FLAGS + diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index 8d7cfea2d..29ec96f73 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -43,7 +43,7 @@ jobs: - name: Update Homebrew if: github.event_name != 'pull_request' run: brew update || true - + - name: Install prerequisites run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas diff --git a/.gitignore b/.gitignore index bca79f043..0fe20ecaa 100644 --- a/.gitignore +++ b/.gitignore @@ -89,5 +89,7 @@ build.* *.swp benchmark/*.goto benchmark/smallscaling +.vscode CMakeCache.txt CMakeFiles/* +.vscode diff --git a/.travis.yml b/.travis.yml index bde0e202d..85a57f6e3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,33 +1,38 @@ # XXX: Precise is already deprecated, new default is Trusty. # https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming -dist: precise +dist: focal sudo: true language: c matrix: include: - &test-ubuntu - os: linux +# os: linux compiler: gcc addons: apt: packages: - gfortran +# before_script: &common-before +# - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" +# script: +# - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# - make -C test $COMMON_FLAGS $BTYPE +# - make -C ctest $COMMON_FLAGS $BTYPE +# - make -C utest $COMMON_FLAGS $BTYPE +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64" +# +# - <<: *test-ubuntu + os: linux-ppc64le before_script: &common-before - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" script: - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64" - - - <<: *test-ubuntu - os: linux-ppc64le - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" env: # for matrix annotation only - TARGET_BOX=PPC64LE_LINUX @@ -55,38 +60,38 @@ matrix: - TARGET_BOX=IBMZ_LINUX - BTYPE="BINARY=64 USE_OPENMP=0 CC=clang" - - <<: *test-ubuntu - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 USE_OPENMP=1" - - - <<: *test-ubuntu - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 INTERFACE64=1" - - - <<: *test-ubuntu - compiler: clang - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 CC=clang" - - - <<: *test-ubuntu - compiler: clang - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 INTERFACE64=1 CC=clang" - - - <<: *test-ubuntu - addons: - apt: - packages: - - gcc-multilib - - gfortran-multilib - env: - - TARGET_BOX=LINUX32 - - BTYPE="BINARY=32" - +# - <<: *test-ubuntu +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 USE_OPENMP=1" +# +# - <<: *test-ubuntu +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 
INTERFACE64=1" +# +# - <<: *test-ubuntu +# compiler: clang +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 CC=clang" +# +# - <<: *test-ubuntu +# compiler: clang +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 INTERFACE64=1 CC=clang" +# +# - <<: *test-ubuntu +# addons: +# apt: +# packages: +# - gcc-multilib +# - gfortran-multilib +# env: +# - TARGET_BOX=LINUX32 +# - BTYPE="BINARY=32" +# - os: linux arch: ppc64le dist: bionic @@ -121,47 +126,47 @@ matrix: # for matrix annotation only - TARGET_BOX=PPC64LE_LINUX_P9 - - os: linux - compiler: gcc - addons: - apt: - packages: - - binutils-mingw-w64-x86-64 - - gcc-mingw-w64-x86-64 - - gfortran-mingw-w64-x86-64 - before_script: *common-before - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=WIN64 - - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" - +# - os: linux +# compiler: gcc +# addons: +# apt: +# packages: +# - binutils-mingw-w64-x86-64 +# - gcc-mingw-w64-x86-64 +# - gfortran-mingw-w64-x86-64 +# before_script: *common-before +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - TARGET_BOX=WIN64 +# - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" +# # Build & test on Alpine Linux inside chroot, i.e. on system with musl libc. # These jobs needs sudo, so Travis runs them on VM-based infrastructure # which is slower than container-based infrastructure used for jobs # that don't require sudo. - - &test-alpine - os: linux - dist: trusty - sudo: true - language: minimal - before_install: - - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ - && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" - - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } - install: - - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' - before_script: *common-before - script: - # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. - - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" - - alpine make -C test $COMMON_FLAGS $BTYPE - - alpine make -C ctest $COMMON_FLAGS $BTYPE - - alpine make -C utest $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64" + # - &test-alpine + # os: linux + # dist: trusty + # sudo: true + # language: minimal + # before_install: + # - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ + # && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" + # - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } + # install: + # - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' + # before_script: *common-before + # script: + # # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. + # - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + # CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" + # - alpine make -C test $COMMON_FLAGS $BTYPE + # - alpine make -C ctest $COMMON_FLAGS $BTYPE + # - alpine make -C utest $COMMON_FLAGS $BTYPE + # env: + # - TARGET_BOX=LINUX64_MUSL + # - BTYPE="BINARY=64" # XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS, # but only on Travis CI, cannot reproduce it elsewhere. 
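
The remaining active Travis job drives the same make targets as the other CI pipelines, so the build can be reproduced outside Travis. A minimal sketch, assuming a Linux host with gcc and gfortran available; TARGET=POWER8 is the value used by the surviving ppc64le job, while the commented-out x86_64 jobs and the new Drone arm64 pipeline use TARGET=NEHALEM and TARGET=ARMV8 respectively:

  COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
  make QUIET_MAKE=1 $COMMON_FLAGS
  make -C test $COMMON_FLAGS
  make -C ctest $COMMON_FLAGS
  make -C utest $COMMON_FLAGS
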
@@ -171,89 +176,98 @@ matrix: # - TARGET_BOX=LINUX64_MUSL # - BTYPE="BINARY=64 USE_OPENMP=1" - - <<: *test-alpine - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64 INTERFACE64=1" +# - <<: *test-alpine +# env: +# - TARGET_BOX=LINUX64_MUSL +# - BTYPE="BINARY=64 INTERFACE64=1" +# +# # Build with the same flags as Alpine do in OpenBLAS package. +# - <<: *test-alpine +# env: +# - TARGET_BOX=LINUX64_MUSL +# - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" - # Build with the same flags as Alpine do in OpenBLAS package. - - <<: *test-alpine - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" +# - &test-cmake +# os: linux +# compiler: clang +# addons: +# apt: +# packages: +# - gfortran +# - cmake +# dist: trusty +# sudo: true +# before_script: +# - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" +# script: +# - mkdir build +# - CONFIG=Release +# - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG +# - cmake --build build --config $CONFIG -- -j2 +# env: +# - CMAKE=1 +# - <<: *test-cmake +# env: +# - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1" +# - <<: *test-cmake +# compiler: gcc +# env: +# - CMAKE=1 - - &test-cmake - os: linux - compiler: clang - addons: - apt: - packages: - - gfortran - - cmake - dist: trusty - sudo: true - before_script: - - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" - script: - - mkdir build - - CONFIG=Release - - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG - - cmake --build build --config $CONFIG -- -j2 - env: - - CMAKE=1 - - <<: *test-cmake - env: - - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1" - - <<: *test-cmake - compiler: gcc - env: - - CMAKE=1 - - - &test-macos - os: osx - osx_image: xcode11.5 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" +# - &test-macos +# os: osx +# osx_image: xcode11.5 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" +# +# - <<: *test-macos +# osx_image: xcode12 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# - brew update +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10" +# +# - <<: *test-macos +# osx_image: xcode12 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# - brew update +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - - <<: *test-macos - osx_image: xcode12 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - - brew install gcc@10 - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - # - <<: *test-macos # osx_image: xcode10 # env: # - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" - - <<: *test-macos - osx_image: xcode11.5 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - env: +# - <<: *test-macos +# osx_image: xcode11.5 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# - brew update +# env: # - 
CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" # - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" - - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" - - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" - - <<: *test-macos - osx_image: xcode11.5 - env: -# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" -# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" - - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" - - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" +# - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" +# - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" +# - <<: *test-macos +# osx_image: xcode11.5 +# env: +## - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +## - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" +# - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" +# - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" - &test-graviton2 os: linux diff --git a/CMakeLists.txt b/CMakeLists.txt index aeb4399e4..ab9f3af80 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,10 +3,13 @@ ## cmake_minimum_required(VERSION 2.8.5) + project(OpenBLAS C ASM) + set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 12.dev) +set(OpenBLAS_PATCH_VERSION 20) + set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -14,54 +17,74 @@ include(GNUInstallDirs) include(CMakePackageConfigHelpers) +if(MSVC AND NOT DEFINED NOFORTRAN) + set(NOFORTRAN ON) +endif() ####### if(MSVC) -option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) + option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) endif() + option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) + option(DYNAMIC_ARCH "Include support for 
multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) + option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) + option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) + option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF) + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") -option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) + option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) else() -set(NO_AFFINITY 1) + set(NO_AFFINITY 1) endif() + option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) + option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) +option(BUILD_STATIC_LIBS "Build static library" OFF) +if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS) + set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) +endif() +if((BUILD_STATIC_LIBS AND BUILD_SHARED_LIBS) AND MSVC) + message(WARNING "Could not enable both BUILD_STATIC_LIBS and BUILD_SHARED_LIBS with MSVC, Disable BUILD_SHARED_LIBS") + set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build static library" FORCE) +endif() # Add a prefix or suffix to all exported symbol names in the shared library. # Avoids conflicts with other BLAS libraries, especially when using # 64 bit integer interfaces in OpenBLAS. - set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" ) + set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) + ####### if(BUILD_WITHOUT_LAPACK) -set(NO_LAPACK 1) -set(NO_LAPACKE 1) + set(NO_LAPACK 1) + set(NO_LAPACKE 1) endif() if(BUILD_WITHOUT_CBLAS) -set(NO_CBLAS 1) + set(NO_CBLAS 1) endif() ####### if(MSVC AND MSVC_STATIC_CRT) - set(CompilerFlags - CMAKE_CXX_FLAGS - CMAKE_CXX_FLAGS_DEBUG - CMAKE_CXX_FLAGS_RELEASE - CMAKE_C_FLAGS - CMAKE_C_FLAGS_DEBUG - CMAKE_C_FLAGS_RELEASE - ) - foreach(CompilerFlag ${CompilerFlags}) - string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") - endforeach() + set(CompilerFlags + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_RELEASE + CMAKE_C_FLAGS + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE + ) + foreach(CompilerFlag ${CompilerFlags}) + string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") + endforeach() endif() message(WARNING "CMake support is experimental. 
It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") @@ -95,7 +118,7 @@ endif () # set which float types we want to build for if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) # if none are defined, build for all -# set(BUILD_BFLOAT16 true) + # set(BUILD_BFLOAT16 true) set(BUILD_SINGLE true) set(BUILD_DOUBLE true) set(BUILD_COMPLEX true) @@ -129,7 +152,7 @@ endif () if (BUILD_BFLOAT16) message(STATUS "Building Half Precision") - list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing + # list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing endif () if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") @@ -140,9 +163,10 @@ endif () set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) if(MSVC) -set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug) -set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release) + set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug) + set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release) endif () + # get obj vars into format that add_library likes: $ (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html) set(TARGET_OBJS "") foreach (SUBDIR ${SUBDIRS}) @@ -180,12 +204,63 @@ if (${DYNAMIC_ARCH}) endif () # add objects to the openblas lib -add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) -target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) +if(NOT NO_LAPACK) + add_library(LAPACK OBJECT ${LA_SOURCES}) + list(APPEND TARGET_OBJS "$") +endif() +if(NOT NO_LAPACKE) + add_library(LAPACKE OBJECT ${LAPACKE_SOURCES}) + list(APPEND TARGET_OBJS "$") +endif() +if(BUILD_RELAPACK) + add_library(RELAPACK OBJECT ${RELA_SOURCES}) + list(APPEND TARGET_OBJS "$") +endif() +set(OpenBLAS_LIBS "") +if(BUILD_STATIC_LIBS) + add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) + target_include_directories(${OpenBLAS_LIBNAME}_static INTERFACE $) + list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_static) +endif() +if(BUILD_SHARED_LIBS) + add_library(${OpenBLAS_LIBNAME}_shared SHARED ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) + target_include_directories(${OpenBLAS_LIBNAME}_shared INTERFACE $) + list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_shared) +endif() +if(BUILD_STATIC_LIBS) + add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_static) +else() + add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_shared) +endif() + +set_target_properties(${OpenBLAS_LIBS} PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME}) # Android needs to explicitly link against libm if(ANDROID) - target_link_libraries(${OpenBLAS_LIBNAME} m) + if(BUILD_STATIC_LIBS) + target_link_libraries(${OpenBLAS_LIBNAME}_static m) + endif() + if(BUILD_SHARED_LIBS) + target_link_libraries(${OpenBLAS_LIBNAME}_shared m) + endif() +endif() + +if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS) + set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) + if (NOT NOFORTRAN) + set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) + set (CMAKE_Fortran_CREATE_SHARED_LIBRARY + "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " + "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " + "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o 
dummy.o -c -x f95-cpp-input - '" + "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" + "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") + else () + set (CMAKE_C_CREATE_SHARED_LIBRARY + "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " + "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " + "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") + endif () endif() # Handle MSVC exports @@ -194,21 +269,21 @@ if(MSVC AND BUILD_SHARED_LIBS) include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") else() # Creates verbose .def file (51KB vs 18KB) - set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true) + set_target_properties(${OpenBLAS_LIBNAME}_shared PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true) endif() endif() # Set output for libopenblas -set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) -set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") -set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS") +set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) +set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") +set_target_properties( ${OpenBLAS_LIBS} PROPERTIES EXPORT_NAME "OpenBLAS") foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) - set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) - set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) - set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) + set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) + set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) + set_target_properties( ${OpenBLAS_LIBS} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) endforeach() enable_testing() @@ -217,10 +292,17 @@ if (USE_THREAD) # Add threading library to linker find_package(Threads) if (THREADS_HAVE_PTHREAD_ARG) - set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY COMPILE_OPTIONS "-pthread") - set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY INTERFACE_COMPILE_OPTIONS "-pthread") + set_target_properties(${OpenBLAS_LIBS} PROPERTIES + COMPILE_OPTIONS "-pthread" + INTERFACE_COMPILE_OPTIONS "-pthread" + ) + endif() + if(BUILD_STATIC_LIBS) + target_link_libraries(${OpenBLAS_LIBNAME}_static ${CMAKE_THREAD_LIBS_INIT}) + endif() + if(BUILD_SHARED_LIBS) + target_link_libraries(${OpenBLAS_LIBNAME}_shared ${CMAKE_THREAD_LIBS_INIT}) endif() - target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) endif() #if (MSVC OR NOT NOFORTRAN) @@ -229,104 +311,116 @@ 
if (NOT NO_CBLAS) add_subdirectory(utest) endif() -if (NOT MSVC AND NOT NOFORTRAN) +if (NOT NOFORTRAN) # Build test and ctest add_subdirectory(test) if(NOT NO_CBLAS) add_subdirectory(ctest) endif() add_subdirectory(lapack-netlib/TESTING) - if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) - add_subdirectory(cpp_thread_test) - endif() + if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) + add_subdirectory(cpp_thread_test) + endif() endif() -set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES +set_target_properties(${OpenBLAS_LIBS} PROPERTIES VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION} SOVERSION ${OpenBLAS_MAJOR_VERSION} ) if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) if (NOT MSVC) - target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition") + target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition") else() - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE") endif() endif() if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") -if (NOT DEFINED ARCH) - set(ARCH_IN "x86_64") -else() - set(ARCH_IN ${ARCH}) -endif() + if (NOT DEFINED ARCH) + set(ARCH_IN "x86_64") + else() + set(ARCH_IN ${ARCH}) + endif() -if (${CORE} STREQUAL "generic") - set(ARCH_IN "GENERIC") -endif () + if (${CORE} STREQUAL "generic") + set(ARCH_IN "GENERIC") + endif () -if (NOT DEFINED EXPRECISION) - set(EXPRECISION_IN 0) -else() - set(EXPRECISION_IN ${EXPRECISION}) -endif() + if (NOT DEFINED EXPRECISION) + set(EXPRECISION_IN 0) + else() + set(EXPRECISION_IN ${EXPRECISION}) + endif() -if (NOT DEFINED NO_CBLAS) - set(NO_CBLAS_IN 0) -else() - set(NO_CBLAS_IN ${NO_CBLAS}) -endif() + if (NOT DEFINED NO_CBLAS) + set(NO_CBLAS_IN 0) + else() + set(NO_CBLAS_IN ${NO_CBLAS}) + endif() -if (NOT DEFINED NO_LAPACK) - set(NO_LAPACK_IN 0) -else() - set(NO_LAPACK_IN ${NO_LAPACK}) -endif() + if (NOT DEFINED NO_LAPACK) + set(NO_LAPACK_IN 0) + else() + set(NO_LAPACK_IN ${NO_LAPACK}) + endif() -if (NOT DEFINED NO_LAPACKE) - set(NO_LAPACKE_IN 0) -else() - set(NO_LAPACKE_IN ${NO_LAPACKE}) -endif() + if (NOT DEFINED NO_LAPACKE) + set(NO_LAPACKE_IN 0) + else() + set(NO_LAPACKE_IN ${NO_LAPACKE}) + endif() -if (NOT DEFINED NEED2UNDERSCORES) - set(NEED2UNDERSCORES_IN 0) -else() - set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) -endif() + if (NOT DEFINED NEED2UNDERSCORES) + set(NEED2UNDERSCORES_IN 0) + else() + set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) + endif() -if (NOT DEFINED ONLY_CBLAS) - set(ONLY_CBLAS_IN 0) -else() - set(ONLY_CBLAS_IN ${ONLY_CBLAS}) -endif() + if (NOT DEFINED ONLY_CBLAS) + set(ONLY_CBLAS_IN 0) + else() + set(ONLY_CBLAS_IN ${ONLY_CBLAS}) + endif() -if (NOT DEFINED BU) - set(BU _) -endif() + if (NOT DEFINED BU) + set(BU _) + endif() -if (NOT ${SYMBOLPREFIX} STREQUAL "") -message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") -endif() -if (NOT ${SYMBOLSUFFIX} STREQUAL "") -message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") -endif() - add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD - COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def - COMMAND objcopy -v --redefine-syms 
${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so - COMMENT "renaming symbols" - ) + if (NOT ${SYMBOLPREFIX} STREQUAL "") + message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") + endif() + if (NOT ${SYMBOLSUFFIX} STREQUAL "") + message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") + endif() + + add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD + COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so + COMMENT "renaming symbols" + ) endif() # Install project # Install libraries -install(TARGETS ${OpenBLAS_LIBNAME} - EXPORT "OpenBLAS${SUFFIX64}Targets" - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) +if(BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS) + install(TARGETS ${OpenBLAS_LIBNAME}_shared + EXPORT "OpenBLAS${SUFFIX64}Targets" + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) + install(TARGETS ${OpenBLAS_LIBNAME}_static + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) +else() + install(TARGETS ${OpenBLAS_LIBS} + EXPORT "OpenBLAS${SUFFIX64}Targets" + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) +endif() # Install headers set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) @@ -362,36 +456,41 @@ if(NOT NOFORTRAN) endif() if(NOT NO_CBLAS) - message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) - string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - if (NOT ${SYMBOLPREFIX} STREQUAL "") - string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") - string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") - string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - endif() - if (NOT ${SYMBOLSUFFIX} STREQUAL "") - string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") - string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") - string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - endif() - file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") - install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") + set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) + string(REPLACE "common" 
"openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + if (NOT ${SYMBOLPREFIX} STREQUAL "") + string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() + if (NOT ${SYMBOLSUFFIX} STREQUAL "") + string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() + file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") + install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() if(NOT NO_LAPACKE) - message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}") - add_dependencies( ${OpenBLAS_LIBNAME} genlapacke) - FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h") - install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) - - ADD_CUSTOM_TARGET(genlapacke - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" - ) - install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) + message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}") + if(BUILD_STATIC_LIBS) + add_dependencies( ${OpenBLAS_LIBNAME}_static genlapacke) + endif() + if(BUILD_SHARED_LIBS) + add_dependencies( ${OpenBLAS_LIBNAME}_shared genlapacke) + endif() + FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h") + install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + + ADD_CUSTOM_TARGET(genlapacke + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" + ) + install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) endif() # Install pkg-config files @@ -416,4 +515,3 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake install(EXPORT "${PN}${SUFFIX64}Targets" NAMESPACE "${PN}${SUFFIX64}::" DESTINATION ${CMAKECONFIG_INSTALL_DIR}) - diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index be9a32a7c..92be1fe42 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -194,3 +194,16 @@ In chronological order: * PingTouGe Semiconductor Co., Ltd. * [2020-10] Add RISC-V Vector (0.7.1) support. 
Optimize BLAS kernels for Xuantie C910
+
+* River Dillon
+ * [2021-07-10] fix compilation with musl libc
+
+* Bine Brank
+ * [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE
+ * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM
+ * [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions
+ * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions
+ * [2022-01-18] SVE kernels and copy functions for TRSM
+
+* Ilya Kurdyukov
+ * [2021-02-21] Add basic support for the Elbrus E2000 architecture
diff --git a/Changelog.txt b/Changelog.txt
index edd3563ec..97af4cbd9 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,340 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.20
+ 20-Feb-2022
+
+general:
+ - some code cleanup, with added casts etc.
+ - fixed obtaining the cpu count with OpenMP and OMP_PROC_BIND unset
+ - fixed pivot index calculation by ?LASWP for negative increments other than one
+ - fixed input argument check in LAPACK ? GEQRT2
+ - improved the check for a Fortran compiler in CMAKE builds
+ - disabled building OpenBLAS' optimized versions of LAPACK complex SPMV,SPR,SYMV,SYR with NO_LAPACK=1
+ - fixed building of LAPACK on certain distributed filesystems with parallel gmake
+ - fixed building the shared library on MacOS with classic flang
+
+x86_64:
+ - fixed cross-compilation with CMAKE for CORE2 target
+ - fixed miscompilation of AVX512 code in DYNAMIC_ARCH builds
+ - added support for the "incidental" AVX512 hardware in Alder Lake when enabled in BIOS
+
+E2K:
+ - add new architecture (Russian Elbrus E2000 family)
+
+SPARC:
+ - fix IMIN/IMAX
+
+ARMV8:
+ - added SVE-enabled CGEMM and ZGEMM kernels for ARMV8SVE and A64FX
+ - added support for Neoverse N2 and V1 cpus
+
+MIPS,MIPS64:
+ - fixed autodetection of MSA capability
+
+LOONGARCH64:
+ - added an optimized DGEMM kernel
+
+====================================================================
+Version 0.3.19
+ 19-Dec-2021
+
+ general:
+ - reverted unsafe TRSV/ZRSV optimizations introduced in 0.3.16
+ - fixed a potential thread race in the thread buffer reallocation routines
+ that were introduced in 0.3.18
+ - fixed miscounting of thread pool size on Linux with OMP_PROC_BIND=TRUE
+ - fixed CBLAS interfaces for CSROT/ZSROT and CROTG/ZROTG
+ - made automatic library suffix for CMAKE builds with INTERFACE64 available
+ to CBLAS-only builds
+
+x86_64:
+ - DYNAMIC_ARCH builds now fall back to the cpu with most similar capabilities
+ when an unknown CPUID is encountered, instead of defaulting to Prescott
+ - added cpu detection for Intel Alder Lake
+ - added cpu detection for Intel Sapphire Rapids
+ - added an optimized SBGEMM kernel for Sapphire Rapids
+ - fixed DYNAMIC_ARCH builds on OSX with CMAKE
+ - worked around DYNAMIC_ARCH builds made on Sandybridge failing on SkylakeX
+ - fixed missing thread initialization for static builds on Windows/MSVC
+ - fixed an excessive read in ZSYMV
+
+POWER:
+ - added support for POWER10 in big-endian mode
+ - added support for building with CMAKE
+ - added optimized SGEMM and DGEMM kernels for small matrix sizes
+
+ARMV8:
+ - added basic support and cputype detection for Fujitsu A64FX
+ - added a generic ARMV8SVE target
+ - added SVE-enabled SGEMM and DGEMM kernels for ARMV8SVE and A64FX
+ - added optimized CGEMM and ZGEMM kernels for Cortex A53 and A55 cpus
+ - fixed cpuid detection for Apple M1 and improved performance
+ - improved 
compiler flag setting in CMAKE builds + +RISCV64: + - fixed improper initialization in CSCAL/ZSCAL for strided access patterns + +MIPS: + - added a GENERIC target for MIPS32 + - added support for cross-compiling to MIPS32 on x86_64 using CMAKE + +MIPS64: + - fixed misdetection of MSA capability + +==================================================================== +Version 0.3.18 + 02-Oct-2021 + +general: + - when the build-time number of preconfigured threads is exceeded + at runtime (typically by an external program calling BLAS functions + from a larger number of threads in parallel), OpenBLAS will now + allocate an auxiliary control structure for up to 512 additional + threads instead of aborting + - added support for Loongson's LoongArch64 cpu architecture + - fixed building OpenBLAS with CMAKE and -DBUILD_BFLOAT16=ON + - added support for building OpenBLAS as a CMAKE subproject + - added support for building for Windows/ARM64 targets with clang + - improved support for building with the IBM xlf compiler + - imported Reference-LAPACK PR 625 (out-of-bounds reads in ?LARRV) + - imported Reference-LAPACK PR 597 for testsuite compatibility with + LLVM's libomp + +x86_64: + - added SkylakeX S/DGEMM kernels for small problem sizes (M*N*K<=1000000) + - added optimized SBGEMM for Intel Cooper Lake + - reinstated the performance patch for AVX512 SGEMV_T with a proper fix + - added a workaround for a gcc11 tree-vectorizer bug that caused spurious + failures in the test programs for complex BLAS3 when compiling at -O3 + (the default for cmake "release" builds) + - added support for runtime cpu count detection under Haiku OS + - worked around a long-standing miscompilation issue of the Haswell DGEMV_T + kernel with gcc that could produce NaN output in some corner cases + +POWER: + - improved performance of DASUM on POWER10 + +ARMV8: + - fixed crashes (use of reserved register x18) on Apple M1 under OSX + - fixed building with gcc releases earlier than 5.1 + +MIPS: + - fixed building under BSD + +MIPS64: + - fixed building under BSD + +==================================================================== +Version 0.3.17 + 15-Jul-2021 + +common: + - reverted the optimization of SGEMV_N/DGEMV_N for small input sizes + and consecutive arguments as it led to stack overflows on x86_64 + with some operating systems (notably OSX and Windows) + + x86_64: + - reverted the performance patch for SGEMV_T on AVX512 as it caused + wrong results in some applications + + SPARC: + - fixed compilation with compilers other than gcc +==================================================================== +Version 0.3.16 + 11-Jul-2021 + +common: + - drastically reduced the stack size requirements for running the LAPACK + testsuite (Reference-LAPACK PR 553) + - fixed spurious test failures in the LAPACK testsuite (Reference-LAPACK + PR 564) + - expressly setting DYNAMIC_ARCH=0 no longer enables dynamic_arch mode + - improved performance of xGER, xSPR, xSPR2, xSYR, xSYR2, xTRSV, SGEMV_N + and DGEMV_N, for small input sizes and consecutive arguments + - improved performance of xGETRF, xPORTF and xPOTRI for small input sizes + by disabling multithreading + - fixed installing with BSD versions of the "install" utility + +RISCV: + - fixed the implementation of xIMIN + - improved the performance of DSDOT + - fixed linking of the tests on C910V with current vendor gcc + +POWER: +- fixed SBGEMM computation for some odd value inputs +- fixed compilation for PPCG4, PPC970, POWER3, POWER4 and POWER5 + +x86_64: + - improved 
performance of SGEMV_N and SGEMV_T for small N on AVX512-capable cpus + - worked around a miscompilation of ZGEMM/ZTRMM on Sandybridge with old gcc + versions + - fixed compilation with MS Visual Studio versions older than 2017 + - fixed macro name collision with winnt.h from the latest Win10 SDK + - added cpu type autodetection for Intel Ice Lake SP + - fixed cpu type autodetection for Intel Tiger Lake + - added cpu type autodetection for recent Centaur/Zhaoxin models + - fixed compilation with musl libc + +ARM64: +- fixed compilation with gcc/gfortran on the Apple M1 +- fixed linking of the tests on FreeBSD +- fixed missing restore of a register in the recently rewritten DNRM2 kernel + for ThunderX2 and Neoverse N1 that could cause spurious failures in e.g. + DGEEV +- added compiler optimization flags for the EMAG8180 +- added initial support for Cortex A55 + +ARM: +- fixed linking of the tests on FreeBSD + +==================================================================== +Version 0.3.15 + 2-May-2021 + +common: + - imported improvements and bugfixes from Reference-LAPACK 3.9.1 + - imported LAPACKE interface fixes from Reference-LAPACK PRs 534 + 537 + - fixed a problem in the cpu detection of 0.3.14 that prevented cross-compilation + - fixed a sequence problem in the generation of softlinks to the library in GMAKE + +RISC V: + - fixed compilation on RISCV (missing entry in getarch) + - fixed a potential division by zero in CROTG and ZROTG + +POWER: + - fixed LAPACK testsuite failures seen with the NVIDIA HPC compiler + - improved CGEMM, DGEMM and ZGEMM performance on POWER10 + - added an optimized ZGEMV kernel for POWER10 + - fixed a potential division by zero in CROTG and ZROTG + +x86_64: + - added support for Intel Control-flow Enforcement Technology (CET) + - reverted the DOMATCOPY_RT code to the generic C version + - fixed a bug in the AVX512 SGEMM kernel introduced in 0.3.14 + - fixed misapplication of -msse flag to non-SSE cpus in DYNAMIC_ARCH + - added support for compilation of the benchmarks on older OSX versions + - fix propagation of the NO_AVX512 option in CMAKE builds + - fix compilation of the AVX512 SGEMM kernel with clang-cl on Windows + - fixed compilation of the CTESTs with INTERFACE64=1 (random faults on OSX) + - corrected the Haswell DROT kernel to require AVX2/FMA3 rather than AVX512 + +ARM: + - fixed a potential division by zero in CROTG and ZROTG + - fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs + +ARM64: + - fixed spurious reads outside the array in the SGEMM tcopy macro + - fixed a potential division by zero in CROTG and ZROTG + - fixed a segmentation fault in DYNAMIC_ARCH builds (reappeared in 0.3.14) + +MIPS + - fixed a potential division by zero in CROTG and ZROTG + - fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs + +MIPS64: + - fixed a potential division by zero in CROTG and ZROTG + +SPARC: + - fixed a potential division by zero in CROTG and ZROTG + +==================================================================== +Version 0.3.14 + 17-Mar-2021 + + common: + * Fixed a race condition on thread shutdown in non-OpenMP builds + * Fixed custom BUFFERSIZE option getting ignored in gmake builds + * Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms + * Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT + * Improved performance of OMATCOPY_RT across all platforms + * Changed perl scripts to use env instead of a hardcoded /usr/bin/perl + * Fixed potential misreading of the GCC compiler version in the build 
scripts + * Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477) + * Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335) + + RISCV: + * Fixed compilation on RISCV (missing entry in getarch) + + POWER: + * Fixed compilation for DYNAMIC_ARCH with clang and with old gcc versions + * Added support for compilation on FreeBSD/ppc64le + * Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL + * Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM + * Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10 + * Improved SCOPY and CCOPY performance on POWER10 + * Improved SGEMM and DGEMM performance on POWER10 + * Added support for compilation with the NVIDIA HPC compiler + + x86_64: + * Added an optimized bfloat16 GEMM kernel for Cooperlake + * Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus + * Improved the performance of SASUM,DASUM,SROT,DROT on AMD Ryzen cpus + * Added support for compilation with the NAG Fortran compiler + * Fixed recognition of the AMD AOCC compiler + * Fixed compilation for DYNAMIC_ARCH with clang on Windows + * Added support for running the BLAS/CBLAS tests on Windows + * Fixed signatures of the tls callback functions for Windows x64 + * Fixed various issues with fma intrinsics support handling + + ARM: + * Added support for embedded Cortex M targets via a new option EMBEDDED + + ARMV8: + * Fixed the THUNDERX2T99 and NEOVERSEN1 DNRM2/ZNRM2 kernels for inputs with Inf + * Added support for the DYNAMIC_LIST option + * Added support for compilation with the NVIDIA HPC compiler + * Added support for compiling with the NAG Fortran compiler + +==================================================================== +Version 0.3.13 + 12-Dec-2020 + + common: + * Added a generic bfloat16 SBGEMV kernel + * Fixed a potentially severe memory leak after fork in OpenMP builds + that was introduced in 0.3.12 + * Added detection of the Fujitsu Fortran compiler + * Added detection of the (e)gfortran compiler on OpenBSD + * Added support for overriding the default name of the library independently + from symbol suffixing in the gmake builds (already supported in cmake) + +RISCV: + * Added a RISC V port optimized for C910V + +POWER: + * Added optimized POWER10 kernels for SAXPY, CAXPY, SDOT, DDOT and DGEMV_N + * Improved DGEMM performance on POWER10 + * Improved STRSM and DTRSM performance on POWER9 and POWER10 + * Fixed segmemtation faults in DYNAMIC_ARCH builds + * Fixed compilation with the PGI compiler + +x86: + * Fixed compilation of kernels that require SSE2 intrinsics since 0.3.12 + +x86_64: + * Added an optimized bfloat16 SBGEMV kernel for SkylakeX and Cooperlake + * Improved the performance of SASUM and DASUM kernels through parallelization + * Improved the performance of SROT and DROT kernels + * Improved the performance of multithreaded xSYRK + * Fixed OpenMP builds that use the LLVM Clang compiler together with GNU gfortran + (where linking of both the LLVM libomp and GNU libgomp could lead to lockups or + wrong results) + * Fixed miscompilations by old gcc 4.6 + * Fixed misdetection of AVX2 capability in some Sandybridge cpus + * Fixed lockups in builds combining DYNAMIC_ARCH with TARGET=GENERIC on OpenBSD + +ARM64: + * Fixed segmemtation faults in DYNAMIC_ARCH builds + +MIPS: + * Improved kernels for Loongson 3R3 ("3A") and 3R4 ("3B") models, including MSA + * Fixed bugs in the MSA kernels for CGEMM, CTRMM, CGEMV and ZGEMV + * Added handling of zero increments in the MSA 
kernels for SSWAP and DSWAP + * Added DYNAMIC_ARCH support for MIPS64 (currently Loongson3R3/3R4 only) + +SPARC: + * Fixed building 32 and 64 bit SPARC kernels with the SolarisStudio compilers + ==================================================================== Version 0.3.12 24-Oct-2020 diff --git a/Makefile b/Makefile index 54dd3be41..1bb3f6b90 100644 --- a/Makefile +++ b/Makefile @@ -32,7 +32,7 @@ export NOFORTRAN export NO_LAPACK endif -LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) +LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test @@ -59,6 +59,9 @@ endif @$(CC) --version > /dev/null 2>&1;\ if [ $$? -eq 0 ]; then \ cverinfo=`$(CC) --version | sed -n '1p'`; \ + if [ -z "$${cverinfo}" ]; then \ + cverinfo=`$(CC) --version | sed -n '2p'`; \ + fi; \ echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\ else \ echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\ @@ -67,6 +70,9 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) @$(FC) --version > /dev/null 2>&1;\ if [ $$? -eq 0 ]; then \ fverinfo=`$(FC) --version | sed -n '1p'`; \ + if [ -z "$${fverinfo}" ]; then \ + fverinfo=`$(FC) --version | sed -n '2p'`; \ + fi; \ echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\ else \ echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ @@ -161,7 +167,6 @@ ifeq ($(NO_SHARED), 1) $(error OpenBLAS: neither static nor shared are enabled.) endif endif - @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) @for d in $(SUBDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ @@ -190,6 +195,7 @@ endif ifdef USE_THREAD @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last endif + @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) @touch lib.grd prof : prof_blas prof_lapack @@ -263,7 +269,7 @@ prof_lapack : lapack_prebuild lapack_prebuild : ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc - -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/Makefile.arm64 b/Makefile.arm64 index 62a877fff..2eade8d78 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -1,80 +1,234 @@ +ifneq ($(C_COMPILER), PGI) + +ifeq ($(C_COMPILER), CLANG) +ISCLANG=1 +endif +ifneq (1, $(filter 1,$(GCCVERSIONGT4) $(ISCLANG))) +CCOMMON_OPT += -march=armv8-a +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a +endif + + +else + ifeq ($(CORE), ARMV8) CCOMMON_OPT += -march=armv8-a +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a endif +endif + +ifeq ($(CORE), ARMV8SVE) +CCOMMON_OPT += -march=armv8-a+sve +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a+sve +endif +endif ifeq ($(CORE), CORTEXA53) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 endif +endif ifeq ($(CORE), CORTEXA57) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 endif +endif ifeq ($(CORE), CORTEXA72) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif 
+endif ifeq ($(CORE), CORTEXA73) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 endif +endif # Use a72 tunings because Neoverse-N1 is only available # in GCC>=9 ifeq ($(CORE), NEOVERSEN1) -ifeq ($(GCCVERSIONGTEQ7), 1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) ifeq ($(GCCVERSIONGTEQ9), 1) CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 +endif +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +endif +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif +endif +endif + +# Use a72 tunings because Neoverse-V1 is only available +# in GCC>=9.4 +ifeq ($(CORE), NEOVERSEV1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) +ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 +endif +else +CCOMMON_OPT += -march=armv8.4-a -mtune=native +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.4-a -mtune=native +endif +endif +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +endif +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif +endif +endif + +# Use a72 tunings because Neoverse-N2 is only available +# in GCC>=9.4 +ifeq ($(CORE), NEOVERSEN2) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) +ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +CCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 +endif +else +CCOMMON_OPT += -march=armv8.5-a -mtune=native +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.5-a -mtune=native +endif +endif else CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif +endif else CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif endif +endif + +# Use a53 tunings because a55 is only available in GCC>=8.1 +ifeq ($(CORE), CORTEXA55) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) +ifeq ($(GCCVERSIONGTEQ8), 1) +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 +endif +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53 +endif +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 +endif +endif +endif ifeq ($(CORE), THUNDERX) CCOMMON_OPT += -march=armv8-a -mtune=thunderx +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=thunderx endif +endif ifeq ($(CORE), FALKOR) CCOMMON_OPT += -march=armv8-a -mtune=falkor +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=falkor endif +endif ifeq ($(CORE), THUNDERX2T99) CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.1-a 
-mtune=thunderx2t99 endif +endif ifeq ($(CORE), THUNDERX3T110) ifeq ($(GCCVERSIONGTEQ10), 1) CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +endif else CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif endif +endif ifeq ($(CORE), VORTEX) CCOMMON_OPT += -march=armv8.3-a +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.3-a endif +endif -ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG))) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 endif endif +endif +ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq ($(CORE), EMAG8180) +CCOMMON_OPT += -march=armv8-a -mtune=emag +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=emag +endif +endif +endif + +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) +ifeq ($(CORE), A64FX) +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx +endif +endif +endif + +endif + +endif diff --git a/Makefile.e2k b/Makefile.e2k new file mode 100644 index 000000000..a5e50b1f0 --- /dev/null +++ b/Makefile.e2k @@ -0,0 +1 @@ +COPT = -Wall -O2 # -DGEMMTEST diff --git a/Makefile.install b/Makefile.install index e8b64465f..28727de37 100644 --- a/Makefile.install +++ b/Makefile.install @@ -74,17 +74,17 @@ endif ifneq ($(OSNAME), AIX) ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" endif #for install static library ifneq ($(NO_STATIC),1) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif @@ -92,7 +92,7 @@ endif ifneq ($(NO_SHARED),1) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) - @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd 
"$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) diff --git a/Makefile.loongarch64 b/Makefile.loongarch64 new file mode 100644 index 000000000..05ea9c679 --- /dev/null +++ b/Makefile.loongarch64 @@ -0,0 +1,3 @@ +ifdef BINARY64 +else +endif diff --git a/Makefile.power b/Makefile.power index c7e972290..28a0bae08 100644 --- a/Makefile.power +++ b/Makefile.power @@ -10,9 +10,15 @@ USE_OPENMP = 1 endif ifeq ($(CORE), POWER10) +ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math endif +endif +endif ifeq ($(CORE), POWER9) ifneq ($(C_COMPILER), PGI) @@ -31,7 +37,11 @@ else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O2 -frecursive -fno-fast-math +endif ifeq ($(C_COMPILER), GCC) ifneq ($(GCCVERSIONGT4), 1) $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) @@ -55,7 +65,11 @@ CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) ifeq ($(OSNAME), AIX) +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math +endif else FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math endif diff --git a/Makefile.prebuild b/Makefile.prebuild index d6395da7b..399db956f 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -3,6 +3,10 @@ export BINARY export USE_OPENMP +ifdef DYNAMIC_ARCH +override HOST_CFLAGS += -DDYNAMIC_ARCH +endif + ifdef TARGET_CORE TARGET_MAKE = Makefile_kernel.conf TARGET_CONF = config_kernel.h diff --git a/Makefile.rule b/Makefile.rule index 1a0965d08..ea093bce6 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.12.dev +VERSION = 0.3.20 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library diff --git a/Makefile.system b/Makefile.system index c17cd3bd1..438a8148a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,11 +9,10 @@ ifndef TOPDIR TOPDIR = . endif - # If ARCH is not set, we use the host system's architecture for getarch compile options. 
-ifndef ARCH
+# We need to use the host system's architecture for the getarch compile options, especially when cross-compiling.
HOSTARCH := $(shell uname -m)
-else
-HOSTARCH = $(ARCH)
+ifeq ($(HOSTARCH), amd64)
+HOSTARCH=x86_64
endif
# Catch conflicting usage of ARCH in some BSD environments
@@ -21,6 +20,8 @@ ifeq ($(ARCH), amd64)
override ARCH=x86_64
else ifeq ($(ARCH), powerpc64)
override ARCH=power
+else ifeq ($(ARCH), powerpc64le)
+override ARCH=power
else ifeq ($(ARCH), powerpc)
override ARCH=power
else ifeq ($(ARCH), i386)
@@ -31,6 +32,10 @@ else ifeq ($(ARCH), armv7)
override ARCH=arm
else ifeq ($(ARCH), aarch64)
override ARCH=arm64
+else ifeq ($(ARCH), mipsel)
+override ARCH=mips
+else ifeq ($(ARCH), mips64el)
+override ARCH=mips64
else ifeq ($(ARCH), zarch)
override ARCH=zarch
endif
@@ -96,7 +101,7 @@ GETARCH_FLAGS += -DUSER_TARGET
ifeq ($(TARGET), GENERIC)
ifeq ($(DYNAMIC_ARCH), 1)
override NO_EXPRECISION=1
-export NO_EXPRECiSION
+export NO_EXPRECISION
endif
endif
endif
@@ -113,6 +118,9 @@ endif
ifeq ($(TARGET), COOPERLAKE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
+ifeq ($(TARGET), SAPPHIRERAPIDS)
+GETARCH_FLAGS := -DFORCE_NEHALEM
+endif
ifeq ($(TARGET), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
@@ -137,8 +145,13 @@ endif
ifeq ($(TARGET), POWER8)
GETARCH_FLAGS := -DFORCE_POWER6
endif
+ifeq ($(TARGET), POWER9)
+GETARCH_FLAGS := -DFORCE_POWER6
+endif
+ifeq ($(TARGET), POWER10)
+GETARCH_FLAGS := -DFORCE_POWER6
+endif
endif
-
#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
#
@@ -158,6 +171,9 @@ endif
ifeq ($(TARGET_CORE), COOPERLAKE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
+ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
+GETARCH_FLAGS := -DFORCE_NEHALEM
+endif
ifeq ($(TARGET_CORE), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
@@ -181,7 +197,7 @@ endif
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
ifeq ($(HOSTARCH), x86_64)
-ifeq ($(findstring pgcc,$(HOSTCC)),)
+ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),)
GETARCH_FLAGS += -march=native
endif
endif
@@ -242,12 +258,26 @@ else
ONLY_CBLAS = 0
endif
+# For small matrix optimization
+ifeq ($(ARCH), x86_64)
+SMALL_MATRIX_OPT = 1
+else ifeq ($(CORE), POWER10)
+SMALL_MATRIX_OPT = 1
+endif
+ifeq ($(SMALL_MATRIX_OPT), 1)
+CCOMMON_OPT += -DSMALL_MATRIX_OPT
+endif
+
# This operation is expensive, so execution should be once.
ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1
+# Determine if the assembler is GNU Assembler
+HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?)
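Makefile.system now derives HOSTARCH unconditionally from `uname -m` (first hunk above), so getarch and the other build-time helpers are always compiled for the machine running the build, while TARGET, CC and FC describe the machine the library is for. A minimal cross-build sketch; the aarch64 toolchain names are illustrative only:

```sh
# getarch is built with HOSTCC for the build host (HOSTARCH); the BLAS
# kernels are built with CC/FC for the CPU named by TARGET.
make TARGET=ARMV8 HOSTCC=gcc \
     CC=aarch64-linux-gnu-gcc FC=aarch64-linux-gnu-gfortran
```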
+GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS) + # Generating Makefile.conf and config.h -DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf @@ -293,7 +323,7 @@ else SMP = 1 endif else -ifeq ($(NUM_THREAD), 1) +ifeq ($(NUM_THREADS), 1) SMP = else SMP = 1 @@ -331,6 +361,7 @@ GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) +GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) @@ -343,6 +374,7 @@ else endif GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1) GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) +GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) endif @@ -378,6 +410,12 @@ ifeq ($(OSNAME), AIX) EXTRALIB += -lm endif +ifeq ($(OSNAME), FreeBSD) +ifeq ($(ARCH), $(filter $(ARCH),arm arm64)) +EXTRALIB += -lm +endif +endif + ifeq ($(OSNAME), WINNT) NEED_PIC = 0 NO_EXPRECISION = 1 @@ -617,12 +655,24 @@ DYNAMIC_CORE += CORTEXA57 DYNAMIC_CORE += CORTEXA72 DYNAMIC_CORE += CORTEXA73 DYNAMIC_CORE += NEOVERSEN1 +DYNAMIC_CORE += NEOVERSEV1 +DYNAMIC_CORE += NEOVERSEN2 +DYNAMIC_CORE += CORTEXA55 DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 DYNAMIC_CORE += EMAG8180 DYNAMIC_CORE += THUNDERX3T110 +ifdef DYNAMIC_LIST +override DYNAMIC_CORE = ARMV8 $(DYNAMIC_LIST) +XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_ARMV8 +XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) +endif +endif + +ifeq ($(ARCH), mips64) +DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 endif ifeq ($(ARCH), zarch) @@ -659,6 +709,7 @@ endif endif # ARCH zarch ifeq ($(ARCH), power) +ifneq ($(C_COMPILER), PGI) DYNAMIC_CORE = POWER6 DYNAMIC_CORE += POWER8 ifneq ($(C_COMPILER), GCC) @@ -672,7 +723,7 @@ DYNAMIC_CORE += POWER9 else $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif -LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` >= 35) +LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35) ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11) DYNAMIC_CORE += POWER10 CCOMMON_OPT += -DHAVE_P10_SUPPORT @@ -685,6 +736,10 @@ else $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) 
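The DYNAMIC_LIST handling added above replaces the full arm64 DYNAMIC_CORE list with ARMV8 plus whatever cores the user requests, which keeps a DYNAMIC_ARCH build small. A hedged example for an arm64 host, reusing core names from the list above:

```sh
# ARMV8 is always kept as the generic fallback entry; only the named cores
# get their optimized kernels compiled in.
make DYNAMIC_ARCH=1 DYNAMIC_LIST="NEOVERSEN1 THUNDERX2T99"
```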
endif endif +else +DYNAMIC_CORE = POWER8 +DYNAMIC_CORE += POWER9 +endif endif # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty @@ -756,6 +811,11 @@ NO_BINARY_MODE = 1 BINARY_DEFINED = 1 endif +ifeq ($(ARCH), loongarch64) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif + # # C Compiler dependent settings @@ -787,14 +847,9 @@ CCOMMON_OPT += -mabi=32 BINARY_DEFINED = 1 endif -ifeq ($(CORE), LOONGSON3A) -CCOMMON_OPT += -march=mips64 -FCOMMON_OPT += -march=mips64 -endif - -ifeq ($(CORE), LOONGSON3B) -CCOMMON_OPT += -march=mips64 -FCOMMON_OPT += -march=mips64 +ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) +CCOMMON_OPT += -march=loongson3a +FCOMMON_OPT += -march=loongson3a endif ifeq ($(CORE), MIPS24K) @@ -831,6 +886,13 @@ ifeq ($(OSNAME), AIX) BINARY_DEFINED = 1 endif +ifeq ($(ARCH), loongarch64) +ifeq ($(CORE), LOONGSON3R5) +CCOMMON_OPT += -march=loongarch64 -mabi=lp64 +FCOMMON_OPT += -march=loongarch64 -mabi=lp64 +endif +endif + endif ifndef BINARY_DEFINED @@ -848,9 +910,29 @@ endif endif ifeq ($(C_COMPILER), PGI) +PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20) +PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 20) +PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |cut -d "-" -f 1 |sed -e "s/[^0-9.]//g" |cut -c 4-5` \>= 11) +PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11) +ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011)) +NEWPGI := 1 +PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 21) +PGCVERSIONEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 21) +PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE11) +ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 100 101 011)) +NEWPGI2 := 1 +endif +endif ifdef BINARY64 ifeq ($(ARCH), x86_64) -CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm +ifneq ($(NEWPGI2),1) +CCOMMON_OPT += -tp p7-64 +else +CCOMMON_OPT += -tp px +endif +ifneq ($(NEWPGI),1) +CCOMMON_OPT += -D__MMX__ -Mnollvm +endif else ifeq ($(ARCH), power) ifeq ($(CORE), POWER8) @@ -862,7 +944,11 @@ endif endif endif else +ifneq ($(NEWPGI2),1) CCOMMON_OPT += -tp p7 +else +CCOMMON_OPT += -tp px +endif endif endif @@ -878,13 +964,25 @@ endif # Fortran Compiler dependent settings # +ifeq ($(F_COMPILER), NAG) +FCOMMON_OPT += -dcfuns -recursive -ieee=full -w=obs -thread_safe +ifdef INTERFACE64 +ifneq ($(INTERFACE64), 0) +FCOMMON_OPT += -i8 +endif +endif +ifeq ($(USE_OPENMP), 1) +FCOMMON_OPT += -openmp +endif +endif + ifeq ($(F_COMPILER), FLANG) CCOMMON_OPT += -DF_INTERFACE_FLANG FCOMMON_OPT += -Mrecursive -Kieee ifeq ($(OSNAME), Linux) ifeq ($(ARCH), x86_64) -FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`) -ifeq ($(FLANG_VENDOR),AOCC) +FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ") +ifeq ($(FLANG_VENDOR), AMD) FCOMMON_OPT += -fno-unroll-loops endif endif @@ -1027,21 +1125,31 @@ FCOMMON_OPT += -i8 endif endif ifeq ($(ARCH), x86_64) +ifneq ($(NEWPGI2),1) FCOMMON_OPT += -tp p7-64 else +FCOMMON_OPT += -tp px +endif +else ifeq ($(ARCH), power) +ifeq ($(CORE), POWER6) +$(warning NVIDIA HPC compilers do not support POWER6.) +endif ifeq ($(CORE), POWER8) FCOMMON_OPT += -tp pwr8 endif ifeq ($(CORE), POWER9) FCOMMON_OPT += -tp pwr9 endif +ifeq ($(CORE), POWER10) +$(warning NVIDIA HPC compilers do not support POWER10.) 
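The NEWPGI/NEWPGI2 logic above chooses between the old `-tp p7-64` and the newer `-tp px` flags by parsing the compiler banner. A stand-alone sketch of the same probe; it assumes, as the Makefile does, that `nvc --version` (or `pgcc --version`) reports its release number on the second line of output:

```sh
CC=nvc
# Same pipeline as the PGCVERSION* checks: keep only digits and dots from
# line 2 of the banner, then take the major number before the first dot.
MAJOR=$($CC --version | sed -n "2p" | sed -e "s/[^0-9.]//g" | cut -d "." -f 1)
echo "detected major release: $MAJOR"
# Makefile.system combines this with the ".11" minor-version check to set
# NEWPGI (release >= 20.11) and NEWPGI2 (release >= 21.11).
```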
+endif endif endif else FCOMMON_OPT += -tp p7 endif -FCOMMON_OPT += -Mrecursive +FCOMMON_OPT += -Mrecursive -Kieee ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -mp endif @@ -1078,11 +1186,11 @@ FCOMMON_OPT += -n32 else FCOMMON_OPT += -n64 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3R3) FCOMMON_OPT += -loongson3 -static endif -ifeq ($(CORE), LOONGSON3B) +ifeq ($(CORE), LOONGSON3R4) FCOMMON_OPT += -loongson3 -static endif @@ -1108,11 +1216,11 @@ CCOMMON_OPT += -n32 else CCOMMON_OPT += -n64 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3R3) CCOMMON_OPT += -loongson3 -static endif -ifeq ($(CORE), LOONGSON3B) +ifeq ($(CORE), LOONGSON3R4) CCOMMON_OPT += -loongson3 -static endif @@ -1180,6 +1288,8 @@ CCOMMON_OPT += -fPIC endif ifeq ($(F_COMPILER), SUN) FCOMMON_OPT += -pic +else ifeq ($(F_COMPILER), NAG) +FCOMMON_OPT += -PIC else FCOMMON_OPT += -fPIC endif @@ -1223,10 +1333,8 @@ ifdef SMP CCOMMON_OPT += -DSMP_SERVER ifeq ($(ARCH), mips64) -ifneq ($(CORE), LOONGSON3B) USE_SIMPLE_THREADED_LEVEL3 = 1 endif -endif ifeq ($(USE_OPENMP), 1) # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -1259,6 +1367,10 @@ CCOMMON_OPT += -DUSE_PAPI EXTRALIB += -lpapi -lperfctr endif +ifdef BUFFERSIZE +CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE) +endif + ifdef DYNAMIC_THREADS CCOMMON_OPT += -DDYNAMIC_THREADS endif @@ -1342,11 +1454,9 @@ endif ifneq ($(ARCH), x86_64) ifneq ($(ARCH), x86) -ifneq ($(CORE), LOONGSON3B) NO_AFFINITY = 1 endif endif -endif ifdef NO_AFFINITY ifeq ($(NO_AFFINITY), 0) @@ -1438,6 +1548,10 @@ LAPACK_FFLAGS := $(FFLAGS) LAPACK_FPFLAGS := $(FPFLAGS) endif +ifeq ($(F_COMPILER),NAG) +LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) +endif + LAPACK_CFLAGS = $(CFLAGS) LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H ifdef INTERFACE64 @@ -1566,8 +1680,10 @@ export HAVE_VFP export HAVE_VFPV3 export HAVE_VFPV4 export HAVE_NEON -export HAVE_MSA -export MSA_FLAGS +ifndef NO_MSA + export HAVE_MSA + export MSA_FLAGS +endif export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE diff --git a/Makefile.x86 b/Makefile.x86 index 0e27264d8..25ca660bd 100644 --- a/Makefile.x86 +++ b/Makefile.x86 @@ -1,10 +1,21 @@ # COMPILER_PREFIX = mingw32- +ifneq ($(DYNAMIC_ARCH),1) +ADD_CPUFLAGS = 1 +else +ifdef TARGET_CORE +ADD_CPUFLAGS = 1 +endif +endif + +ifdef ADD_CPUFLAGS ifdef HAVE_SSE CCOMMON_OPT += -msse +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -msse endif - +endif +endif ifeq ($(OSNAME), Interix) ARFLAGS = -m x86 diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 00967bcb6..f14a8a8ff 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -8,42 +8,57 @@ endif endif endif + +ifneq ($(DYNAMIC_ARCH),1) +ADD_CPUFLAGS = 1 +else +ifdef TARGET_CORE +ADD_CPUFLAGS = 1 +endif +endif + +ifdef ADD_CPUFLAGS ifdef HAVE_SSE3 CCOMMON_OPT += -msse3 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -msse3 endif +endif ifdef HAVE_SSSE3 CCOMMON_OPT += -mssse3 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -mssse3 endif +endif ifdef HAVE_SSE4_1 CCOMMON_OPT += -msse4.1 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -msse4.1 endif +endif ifndef OLDGCC ifdef HAVE_AVX CCOMMON_OPT += -mavx +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -mavx endif endif +endif ifndef NO_AVX2 ifdef HAVE_AVX2 CCOMMON_OPT += -mavx2 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -mavx2 endif endif -ifndef OLDGCC -ifdef HAVE_FMA3 -CCOMMON_OPT += -mfma -FCOMMON_OPT += -mfma -endif endif ifeq ($(CORE), SKYLAKEX) -ifndef DYNAMIC_ARCH ifndef NO_AVX512 CCOMMON_OPT += -march=skylake-avx512 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += 
-march=skylake-avx512 +endif ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables FCOMMON_OPT += -fno-asynchronous-unwind-tables @@ -56,17 +71,22 @@ endif endif endif endif -endif ifeq ($(CORE), COOPERLAKE) -ifndef DYNAMIC_ARCH ifndef NO_AVX512 ifeq ($(C_COMPILER), GCC) # cooperlake support was added in 10.1 ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) CCOMMON_OPT += -march=cooperlake +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=cooperlake endif +else # gcc not support, fallback to avx512 +CCOMMON_OPT += -march=skylake-avx512 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=skylake-avx512 +endif +endif endif ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables @@ -80,6 +100,34 @@ endif endif endif endif + +ifeq ($(CORE), SAPPHIRERAPIDS) +ifndef NO_AVX512 +ifeq ($(C_COMPILER), GCC) +# sapphire rapids support was added in 11 +ifeq ($(GCCVERSIONGTEQ11), 1) +CCOMMON_OPT += -march=sapphirerapids +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=sapphirerapids +endif +else # gcc not support, fallback to avx512 +CCOMMON_OPT += -march=skylake-avx512 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=skylake-avx512 +endif +endif +endif +ifeq ($(OSNAME), CYGWIN_NT) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +ifeq ($(OSNAME), WINNT) +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +endif +endif endif ifdef HAVE_AVX2 @@ -112,6 +160,7 @@ endif endif endif +endif ifeq ($(OSNAME), Interix) diff --git a/README.md b/README.md index 267df5358..6ce85e08e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Join the chat at https://gitter.im/xianyi/OpenBLAS](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) +Travis CI: [![Build Status](https://travis-ci.com/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.com/xianyi/OpenBLAS) AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) @@ -13,17 +13,21 @@ Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/sta ## Introduction -OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. +OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version. Please read the documentation on the OpenBLAS wiki pages: . +For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib: +. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six +20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare or Youtube may be helpful. + ## Binary Packages We provide official binary packages for the following platform: * Windows x86/x86_64 -You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/). 
+You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/xianyi/OpenBLAS/releases](https://github.com/xianyi/OpenBLAS/releases). ## Installation from Source @@ -124,6 +128,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. - **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. +- **Intel Cooper Lake**: as Skylake-X with improved BFLOAT16 support. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. @@ -149,6 +154,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS - **Cortex-A53**: same as ARMV8 (different cpu specifications) +- **Cortex-A55**: same as ARMV8 (different cpu specifications) - **Cortex A57**: Optimized Level-3 and Level-2 functions - **Cortex A72**: same as A57 ( different cpu specifications) - **Cortex A73**: same as A57 (different cpu specifications) @@ -174,10 +180,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th #### RISC-V -- **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. +- **C910V**: Optimized Level-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. ```sh make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran ``` + (also known to work on C906) ### Support for multiple targets in a single library @@ -208,7 +215,8 @@ Please note that it is not possible to combine support for different architectur - **Android**: Supported by the community. Please read . - **AIX**: Supported on PPC up to POWER8 - **Haiku**: Supported by the community. We don't actively test the library on this OS. -- **SunOS**: Supported by the community. We don't actively test the library on this OS: +- **SunOS**: Supported by the community. We don't actively test the library on this OS. +- **Cortex-M**: Supported by the community. Please read . ## Usage diff --git a/TargetList.txt b/TargetList.txt index d19964916..a5a07a661 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -23,6 +23,7 @@ HASWELL SKYLAKEX ATOM COOPERLAKE +SAPPHIRERAPIDS b)AMD CPU: ATHLON @@ -92,6 +93,9 @@ CORTEXA57 CORTEXA72 CORTEXA73 NEOVERSEN1 +NEOVERSEV1 +NEOVERSEN2 +CORTEXA55 EMAG8180 FALKOR THUNDERX @@ -109,3 +113,9 @@ Z14 RISCV64_GENERIC C910V +11.LOONGARCH64: +LOONGSON3R5 + +12. 
Elbrus E2000: +E2K + diff --git a/appveyor.yml b/appveyor.yml index 1936059d5..96a967387 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -29,15 +29,15 @@ environment: global: CONDA_INSTALL_LOCN: C:\\Miniconda36-x64 matrix: - - COMPILER: clang-cl - WITH_FORTRAN: yes - - COMPILER: clang-cl - DYNAMIC_ARCH: ON - WITH_FORTRAN: no - - COMPILER: cl - - COMPILER: MinGW64-gcc-7.2.0-mingw - DYNAMIC_ARCH: OFF - WITH_FORTRAN: ignore +# - COMPILER: clang-cl +# WITH_FORTRAN: ON +# - COMPILER: clang-cl +# DYNAMIC_ARCH: ON +# WITH_FORTRAN: OFF +# - COMPILER: cl +# - COMPILER: MinGW64-gcc-7.2.0-mingw +# DYNAMIC_ARCH: OFF +# WITH_FORTRAN: ignore - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 COMPILER: MinGW-gcc-6.3.0-32 - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 @@ -46,13 +46,10 @@ environment: install: - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat + - if [%COMPILER%]==[clang-cl] conda update --yes -n base conda - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force - - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake - - - if [%WITH_FORTRAN%]==[no] conda install --yes --quiet ninja - - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet -c isuruf kitware-ninja - - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet flang - + - if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false + - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1 - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 - if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%" - if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%" @@ -68,15 +65,14 @@ before_build: - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. + - if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DMSVC_STATIC_CRT=ON .. + - if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. + - if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. build_script: - cmake --build . 
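In the AppVeyor configuration just below, the manual `cd utest && openblas_utest` step is replaced by a single CTest invocation. A minimal local equivalent of that build-and-test sequence (generator and build type are illustrative, not the only supported combination):

```sh
mkdir build && cd build
cmake -G "Ninja" -DCMAKE_BUILD_TYPE=Release ..
cmake --build .
# -j2 runs two tests in parallel, matching the CI change.
ctest -j2
```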
test_script: - - echo Running Test - - cd utest - - openblas_utest + - ctest -j2 diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 639cb3558..04ed428de 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -4,14 +4,22 @@ trigger: branches: include: - develop - +resources: + containers: + - container: oneapi-hpckit + image: intel/oneapi-hpckit:latest + options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so' + - container: oneapi-basekit + image: intel/oneapi-basekit:latest + options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so' + jobs: # manylinux1 is useful to test because the # standard Docker container uses an old version # of gcc / glibc - job: manylinux1_gcc pool: - vmImage: 'ubuntu-16.04' + vmImage: 'ubuntu-latest' steps: - script: | echo "FROM quay.io/pypa/manylinux1_x86_64 @@ -27,7 +35,7 @@ jobs: displayName: Run manylinux1 docker build - job: Intel_SDE_skx pool: - vmImage: 'ubuntu-16.04' + vmImage: 'ubuntu-latest' steps: - script: | # at the time of writing the available Azure Ubuntu vm image @@ -67,5 +75,189 @@ jobs: cd utest dir openblas_utest.exe - + +- job: Windows_mingw_gmake + pool: + vmImage: 'windows-latest' + steps: + - script: | + mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="NEHALEM SANDYBRIDGE HASWELL" + +- job: Windows_clang_cmake + pool: + vmImage: 'windows-latest' + steps: + - script: | + set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" + set "LIB=C:\Miniconda\Library\lib;%LIB%" + set "CPATH=C:\Miniconda\Library\include;%CPATH% + conda config --add channels conda-forge --force + conda config --set auto_update_conda false + conda install --yes ninja + call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + mkdir build + cd build + cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DNOFORTRAN=1 -DMSVC_STATIC_CRT=ON .. + cmake --build . --config Release + ctest + +- job: Windows_flang_clang + pool: + vmImage: 'windows-latest' + steps: + - script: | + set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" + set "LIB=C:\Miniconda\Library\lib;%LIB%" + set "CPATH=C:\Miniconda\Library\include;%CPATH%" + conda config --add channels conda-forge --force + conda config --set auto_update_conda false + conda install --yes --quiet ninja flang + mkdir build + cd build + call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. + cmake --build . 
--config Release + ctest + +- job: OSX_OpenMP + pool: + vmImage: 'macOS-10.15' + steps: + - script: | + brew update + make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 + make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 PREFIX=../blasinst install + ls -lR ../blasinst + +- job: OSX_GCC_Nothreads + pool: + vmImage: 'macOS-10.15' + steps: + - script: | + brew update + make USE_THREADS=0 CC=gcc-10 FC=gfortran-10 + +- job: OSX_OpenMP_Clang + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + LIBRARY_PATH: /usr/local/opt/llvm/lib + steps: + - script: | + brew update + brew install llvm libomp + make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10 + +- job: OSX_OpenMP_Clang_cmake + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + LIBRARY_PATH: /usr/local/opt/llvm/lib + steps: + - script: | + brew update + brew install llvm libomp + mkdir build + cd build + cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNOFORTRAN=1 -DNO_AVX512=1 .. + make + ctest + +- job: OSX_dynarch_cmake + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + LIBRARY_PATH: /usr/local/opt/llvm/lib + steps: + - script: | + mkdir build + cd build + cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. + cmake --build . + ctest +- job: OSX_Ifort_Clang + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg + LIBRARY_PATH: /usr/local/opt/llvm/lib + MACOS_FORTRAN_COMPONENTS: intel.oneapi.mac.ifort-compiler + steps: + - script: | + brew update + brew install llvm libomp + sudo mkdir -p /opt/intel + sudo chown $USER /opt/intel + displayName: prepare for cache restore + - task: Cache@2 + inputs: + path: /opt/intel/oneapi + key: '"install" | "$(MACOS_HPCKIT_URL)" | "$(MACOS_FORTRAN_COMPONENTS)"' + cacheHitVar: CACHE_RESTORED + - script: | + curl --output webimage.dmg --url $(MACOS_HPCKIT_URL) --retry 5 --retry-delay 5 + hdiutil attach webimage.dmg + sudo /Volumes/"$(basename "$(MACOS_HPCKIT_URL)" .dmg)"/bootstrapper.app/Contents/MacOS/bootstrapper -s --action install --components="$(MACOS_FORTRAN_COMPONENTS)" --eula=accept --continue-with-optional-error=yes --log-dir=. + installer_exit_code=$? 
+ hdiutil detach /Volumes/"$(basename "$URL" .dmg)" -quiet + exit $installer_exit_code + displayName: install + condition: ne(variables.CACHE_RESTORED, 'true') + - script: | + source /opt/intel/oneapi/setvars.sh + make CC=/usr/local/opt/llvm/bin/clang FC=ifort + +- job: OSX_NDK_ARMV7 + pool: + vmImage: 'macOS-10.15' + steps: + - script: | + brew update + brew install --cask android-ndk + export ANDROID_NDK_HOME=/usr/local/share/android-ndk + make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/llvm-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 + +- job: OSX_IOS_ARMV8 + pool: + vmImage: 'macOS-11' + variables: + CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0 + steps: + - script: | + make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 + +- job: OSX_IOS_ARMV7 + pool: + vmImage: 'macOS-10.15' + variables: + CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1 + steps: + - script: | + make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 + +- job: ALPINE_MUSL + pool: + vmImage: 'ubuntu-latest' + steps: + - script: | + wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \ + && echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \ + || exit 1 + alpine() { /alpine/enter-chroot -u "$USER" "$@"; } + sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' + alpine make DYNAMIC_ARCH=1 BINARY=64 + alpine make DYNAMIC_ARCH=1 BINARY=64 PREFIX=mytestdir install + alpine ls -l mytestdir/include + alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c + alpine echo "#include " >>test_install.c + alpine echo "int main(){" >> test_install.c + alpine echo "cpu_set_t* cpu_set = NULL;}" >>test_install.c + alpine gcc -Imytestdir/include test_install.c -Lmytestdir/lib -lopenblas -lpthread -lgfortran -o test_install + diff --git a/benchmark/bench.h b/benchmark/bench.h index 1f9b8986c..c03d72bef 100644 --- a/benchmark/bench.h +++ b/benchmark/bench.h @@ -3,6 +3,8 @@ #include #ifdef __CYGWIN32__ #include +#elif defined(__APPLE__) +#include #endif #include "common.h" @@ -74,6 +76,9 @@ static void *huge_malloc(BLASLONG size){ #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) struct timeval start, stop; +#elif defined(__APPLE__) + mach_timebase_info_data_t info; + uint64_t start = 0, stop = 0; #else struct timespec start = { 0, 0 }, stop = { 0, 0 }; #endif @@ -82,6 +87,9 @@ double getsec() { #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; +#elif defined(__APPLE__) + mach_timebase_info(&info); + return (double)(((stop - start) * info.numer)/info.denom) * 1.e-9; #else return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9; #endif @@ -90,6 +98,8 @@ 
double getsec() void begin() { #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) gettimeofday( &start, (struct timezone *)0); +#elif defined(__APPLE__) + start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); #else clock_gettime(CLOCK_REALTIME, &start); #endif @@ -98,7 +108,9 @@ void begin() { void end() { #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) gettimeofday( &stop, (struct timezone *)0); +#elif defined(__APPLE__) + stop = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); #else clock_gettime(CLOCK_REALTIME, &stop); #endif -} \ No newline at end of file +} diff --git a/benchmark/gemv.c b/benchmark/gemv.c index a0001277a..fc39f3f3d 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -125,7 +125,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ - a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } @@ -162,7 +162,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ - a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/getri.c b/benchmark/getri.c index 98a860906..4c8891226 100644 --- a/benchmark/getri.c +++ b/benchmark/getri.c @@ -72,13 +72,17 @@ int main(int argc, char *argv[]){ FLOAT *a,*work; FLOAT wkopt[4]; blasint *ipiv; - blasint m, i, j, info,lwork; + blasint m, i, j, l, info,lwork; int from = 1; int to = 200; int step = 1; + int loops = 1; - double time1; + double time1,timeg; + + char *p; + char btest = 'I'; argc--;argv++; @@ -86,6 +90,9 @@ int main(int argc, char *argv[]){ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} + if ((p = getenv("OPENBLAS_TEST"))) btest=*p; + + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); @@ -124,32 +131,41 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE FLops Time Lwork\n"); for(m = from; m <= to; m += step){ - + timeg = 0.; fprintf(stderr, " %6d : ", (int)m); - GETRF (&m, &m, a, &m, ipiv, &info); + for (l = 0; l < loops; l++) { + if (btest == 'F') begin(); + GETRF (&m, &m, a, &m, ipiv, &info); + if (btest == 'F') { + end(); + timeg += getsec(); + } if (info) { fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } - begin(); + if (btest == 'I') begin(); lwork = -1; GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info); lwork = (blasint)wkopt[0]; GETRI(&m, a, &m, ipiv, work, &lwork, &info); - end(); + if (btest == 'I') end(); if (info) { fprintf(stderr, "failed compute inverse matrix .. 
%d\n", info); exit(1); } - time1 = getsec(); - + if (btest == 'I') + timeg += getsec(); + + } // loops + time1 = timeg/(double)loops; fprintf(stderr, " %10.2f MFlops : %10.2f Sec : %d\n", COMPSIZE * COMPSIZE * (4.0/3.0 * (double)m * (double)m *(double)m - (double)m *(double)m + 5.0/3.0* (double)m) / time1 * 1.e-6,time1,lwork); diff --git a/benchmark/linpack.c b/benchmark/linpack.c index 202035245..32ccb0386 100644 --- a/benchmark/linpack.c +++ b/benchmark/linpack.c @@ -72,17 +72,21 @@ int main(int argc, char *argv[]){ FLOAT *a, *b; blasint *ipiv; - blasint m, i, j, info; + blasint m, i, j, l, info; blasint unit = 1; int from = 1; int to = 200; int step = 1; + int loops = 1; FLOAT maxerr; - double time1, time2; + double time1, time2, timeg1,timeg2; + char *p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; + argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} @@ -110,9 +114,9 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE Residual Decompose Solve Total\n"); for(m = from; m <= to; m += step){ - + timeg1 = timeg2 = 0.; fprintf(stderr, " %6d : ", (int)m); - + for (l = 0; l < loops; l++) { for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; @@ -138,7 +142,7 @@ int main(int argc, char *argv[]){ exit(1); } - time1 = getsec(); + timeg1 += getsec(); begin(); @@ -151,8 +155,10 @@ int main(int argc, char *argv[]){ exit(1); } - time2 = getsec(); - + timeg2 += getsec(); + } //loops + time1=timeg1/(double)loops; + time2=timeg2/(double)loops; maxerr = 0.; for(i = 0; i < m; i++){ diff --git a/benchmark/potrf.c b/benchmark/potrf.c index 116d0cca5..8808203a5 100644 --- a/benchmark/potrf.c +++ b/benchmark/potrf.c @@ -99,14 +99,15 @@ int main(int argc, char *argv[]){ char *p; char btest = 'F'; - blasint m, i, j, info, uplos=0; - double flops; + blasint m, i, j, l, info, uplos=0; + double flops = 0.; int from = 1; int to = 200; int step = 1; + int loops = 1; - double time1; + double time1, timeg; argc--;argv++; @@ -119,6 +120,8 @@ int main(int argc, char *argv[]){ if ((p = getenv("OPENBLAS_TEST"))) btest=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ @@ -129,19 +132,21 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - for(m = from; m <= to; m += step){ + for(m = from; m <= to; m += step){ + timeg=0.; + for (l = 0; l < loops; l++) { #ifndef COMPLEX if (uplos & 1) { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = 0.; - a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; + a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; } } else { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; - a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; + a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = 0.; } } @@ -192,8 +197,8 @@ int main(int argc, char *argv[]){ exit(1); } - time1 = getsec(); - flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 
1.e-6; + if ( btest == 'F') + timeg += getsec(); if ( btest == 'S' ) { @@ -214,9 +219,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, "Potrs info = %d\n", info); exit(1); } - time1 = getsec(); - flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; - + timeg += getsec(); } if ( btest == 'I' ) @@ -232,11 +235,17 @@ int main(int argc, char *argv[]){ fprintf(stderr, "Potri info = %d\n", info); exit(1); } - - time1 = getsec(); - flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; + timeg += getsec(); } - + } // loops + + time1 = timeg/(double)loops; + if ( btest == 'F') + flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; + if ( btest == 'S') + flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; + if ( btest == 'I') + flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; fprintf(stderr, "%8d : %10.2f MFlops : %10.3f Sec : Test=%c\n",m,flops ,time1,btest); diff --git a/benchmark/syr2.c b/benchmark/syr2.c index acbc86987..61d1036ea 100644 --- a/benchmark/syr2.c +++ b/benchmark/syr2.c @@ -46,14 +46,17 @@ int main(int argc, char *argv[]){ if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; - blasint m, i, j; + blasint m, i, j, l; blasint inc_x= 1; blasint inc_y= 1; int from = 1; int to = 200; int step = 1; + int loops = 1; - double time1; + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; + + double time1,timeg; argc--;argv++; @@ -85,8 +88,9 @@ int main(int argc, char *argv[]){ for(m = from; m <= to; m += step) { - + timeg = 0.; fprintf(stderr, " %6d : ", (int)m); + for (l = 0; l < loops; l++) { for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } @@ -107,8 +111,10 @@ int main(int argc, char *argv[]){ end(); - time1 = getsec(); + timeg += getsec(); + } // loops + time1 = timeg/(double)loops; fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time1 * 1.e-6); diff --git a/benchmark/syrk.c b/benchmark/syrk.c index 82606a21a..fa0f24666 100644 --- a/benchmark/syrk.c +++ b/benchmark/syrk.c @@ -56,17 +56,20 @@ int main(int argc, char *argv[]){ char uplo='U'; char trans='N'; - + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; - blasint m, i, j; + blasint m, i, j, l; int from = 1; int to = 200; int step = 1; + int loops = 1; + + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; - double time1; + double time1,timeg; argc--;argv++; @@ -95,9 +98,12 @@ int main(int argc, char *argv[]){ for(m = from; m <= to; m += step) { + timeg = 0.; fprintf(stderr, " %6d : ", (int)m); + for(l = 0; l < loops; l++) { + for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; @@ -111,8 +117,10 @@ int main(int argc, char *argv[]){ end(); - time1 = getsec(); - + timeg += getsec(); + + } //loops + time1 = timeg / (double)loops; fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 1. 
* (double)m * (double)m * (double)m / time1 * 1.e-6); diff --git a/c_check b/c_check index fe9c53f0e..999f5a7a7 100644 --- a/c_check +++ b/c_check @@ -1,11 +1,11 @@ -#!/usr/bin/perl +#!/usr/bin/env perl #use File::Basename; # use File::Temp qw(tempfile); # Checking cross compile $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); -$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); +$hostarch = `uname -m | sed -e s/i.86/x86/`; $hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS"); chop($hostarch); $hostarch = "x86_64" if ($hostarch eq "amd64"); @@ -82,18 +82,20 @@ $os = Interix if ($data =~ /OS_INTERIX/); $os = Android if ($data =~ /OS_ANDROID/); $os = Haiku if ($data =~ /OS_HAIKU/); -$architecture = x86 if ($data =~ /ARCH_X86/); -$architecture = x86_64 if ($data =~ /ARCH_X86_64/); -$architecture = power if ($data =~ /ARCH_POWER/); -$architecture = mips if ($data =~ /ARCH_MIPS/); -$architecture = mips64 if ($data =~ /ARCH_MIPS64/); -$architecture = alpha if ($data =~ /ARCH_ALPHA/); -$architecture = sparc if ($data =~ /ARCH_SPARC/); -$architecture = ia64 if ($data =~ /ARCH_IA64/); -$architecture = arm if ($data =~ /ARCH_ARM/); -$architecture = arm64 if ($data =~ /ARCH_ARM64/); -$architecture = zarch if ($data =~ /ARCH_ZARCH/); -$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); +$architecture = x86 if ($data =~ /ARCH_X86/); +$architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = e2k if ($data =~ /ARCH_E2K/); +$architecture = power if ($data =~ /ARCH_POWER/); +$architecture = mips if ($data =~ /ARCH_MIPS/); +$architecture = mips64 if ($data =~ /ARCH_MIPS64/); +$architecture = alpha if ($data =~ /ARCH_ALPHA/); +$architecture = sparc if ($data =~ /ARCH_SPARC/); +$architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); +$architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); +$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); $defined = 0; @@ -123,6 +125,11 @@ if ($architecture eq "zarch") { $binary = 64; } +if ($architecture eq "e2k") { + $defined = 1; + $binary = 64; +} + if ($architecture eq "alpha") { $defined = 1; $binary = 64; @@ -143,6 +150,11 @@ if ($architecture eq "riscv64") { $binary = 64; } +if ($architecture eq "loongarch64") { + $defined = 1; + $binary = 64; +} + if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); @@ -199,7 +211,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { } else { $tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); $code = '"addvi.b $w0, $w1, 1"'; - $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; + $msa_flags = "-mmsa -mfp64 -mload-store-pairs"; print $tmpf "#include \n\n"; print $tmpf "void main(void){ __asm__ volatile($code); }\n"; @@ -215,17 +227,19 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { } } -$architecture = x86 if ($data =~ /ARCH_X86/); -$architecture = x86_64 if ($data =~ /ARCH_X86_64/); -$architecture = power if ($data =~ /ARCH_POWER/); -$architecture = mips if ($data =~ /ARCH_MIPS/); -$architecture = mips64 if ($data =~ /ARCH_MIPS64/); -$architecture = alpha if ($data =~ /ARCH_ALPHA/); -$architecture = sparc if ($data =~ /ARCH_SPARC/); -$architecture = ia64 if ($data =~ /ARCH_IA64/); -$architecture = arm if ($data =~ /ARCH_ARM/); -$architecture = arm64 if ($data =~ /ARCH_ARM64/); -$architecture = zarch if ($data =~ 
/ARCH_ZARCH/); +$architecture = x86 if ($data =~ /ARCH_X86/); +$architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = e2k if ($data =~ /ARCH_E2K/); +$architecture = power if ($data =~ /ARCH_POWER/); +$architecture = mips if ($data =~ /ARCH_MIPS/); +$architecture = mips64 if ($data =~ /ARCH_MIPS64/); +$architecture = alpha if ($data =~ /ARCH_ALPHA/); +$architecture = sparc if ($data =~ /ARCH_SPARC/); +$architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); +$architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); diff --git a/cblas.h b/cblas.h index da00d46d6..a5ad25ad7 100644 --- a/cblas.h +++ b/cblas.h @@ -125,9 +125,14 @@ void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); +void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); +void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); void cblas_srotg(float *a, float *b, float *c, float *s); void cblas_drotg(double *a, double *b, double *c, double *s); +void cblas_crotg(void *a, void *b, float *c, void *s); +void cblas_zrotg(void *a, void *b, double *c, void *s); + void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P); void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P); @@ -395,6 +400,8 @@ void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); +void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 5457bfb07..f4a135e82 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,7 +44,10 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX 
THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 THUNDERX3T110) + if (DYNAMIC_LIST) + set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) + endif () endif () if (POWER) @@ -106,7 +109,11 @@ if (${ARCH} STREQUAL "ia64") endif () endif () -if (MIPS64) +if (MIPS32 OR MIPS64) + set(NO_BINARY_MODE 1) +endif () + +if (LOONGARCH64) set(NO_BINARY_MODE 1) endif () diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 76952152b..06bc14986 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -15,6 +15,11 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS if (NO_BINARY_MODE) + if (MIPS32) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=32") + set(BINARY_DEFINED 1) + endif () + if (MIPS64) if (BINARY64) set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64") @@ -29,6 +34,15 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") endif () + if (LOONGARCH64) + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32") + endif () + set(BINARY_DEFINED 1) + endif () + if (CMAKE_SYSTEM_NAME STREQUAL "AIX") set(BINARY_DEFINED 1) endif () @@ -117,6 +131,65 @@ if (${CORE} STREQUAL COOPERLAKE) endif () endif () +if (${CORE} STREQUAL SAPPHIRERAPIDS) + if (NOT DYNAMIC_ARCH) + if (NOT NO_AVX512) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=sapphirerapids") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") + endif() + endif () + endif () +endif () + +if (${CORE} STREQUAL A64FX) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() + endif () +endif () + +if (${CORE} STREQUAL ARMV8SVE) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + endif () +endif () + +if (${CORE} STREQUAL POWER10) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2) + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") + else () + message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10." 
) + endif() + endif () +endif () + +if (${CORE} STREQUAL POWER9) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0) + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") + message(WARNING "Compiler GCC.${GCC_VERSION} does not fully support Power9.") + endif () + endif () +endif () + +if (${CORE} STREQUAL POWER8) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") + endif () +endif () + if (NOT DYNAMIC_ARCH) if (HAVE_AVX2) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2") @@ -124,9 +197,9 @@ if (NOT DYNAMIC_ARCH) if (HAVE_AVX) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx") endif () - if (HAVE_FMA3) - set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") - endif () + # if (HAVE_FMA3) + #set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") + #endif () if (HAVE_SSE) set (CCOMMON_OPT "${CCOMMON_OPT} -msse") endif () diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index 0f5d0e15d..14683ed21 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -20,19 +20,16 @@ # NEEDBUNDERSCORE # NEED2UNDERSCORES -if (NOT NO_LAPACK) - include(CheckLanguage) - check_language(Fortran) - if(CMAKE_Fortran_COMPILER) - enable_language(Fortran) - else() - message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") +include(CheckLanguage) +check_language(Fortran) +if(CMAKE_Fortran_COMPILER) + enable_language(Fortran) +else() + if (NOT NO_LAPACK) + message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") + endif() set (NOFORTRAN 1) set (NO_LAPACK 1) - endif() -else() - include(CMakeForceCompiler) - CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) endif() if (NOT ONLY_CBLAS) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index fc1f9bb22..9feda9be3 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -3,11 +3,6 @@ ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets Fortran related variables. -if (INTERFACE64) - set(SUFFIX64 64) - set(SUFFIX64_UNDERSCORE _64) -endif() - if (${F_COMPILER} STREQUAL "FLANG") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") if (BINARY64 AND INTERFACE64) @@ -61,6 +56,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") endif () endif () + if (LOONGARCH64) + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") + endif () + endif () else () if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -m64") @@ -97,7 +99,7 @@ endif () if (${F_COMPILER} STREQUAL "IBM") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM") - # FCOMMON_OPT += -qarch=440 + set(FCOMMON_OPT "${FCOMMON_OPT} -qrecur") if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -q64") if (INTERFACE64) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 0c102bae5..efededcf3 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -1,212 +1,218 @@ # helper functions for the kernel CMakeLists.txt +function(SetFallback KERNEL SOURCE_PATH) + if (NOT (DEFINED ${KERNEL})) + set(${KERNEL} ${SOURCE_PATH} PARENT_SCOPE) + endif () +endfunction() -# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file. 
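The new SetFallback helper at the top of cmake/kernel.cmake only assigns a kernel source when the variable is still undefined, so selections made earlier by a target's KERNEL file are no longer overwritten by the generic defaults. A hypothetical illustration of that behaviour (the `dot_custom.S` name is made up; SDOTKERNEL, DDOTKERNEL and dot.S come from the lists below):

```cmake
set(SDOTKERNEL dot_custom.S)   # pretend the per-target KERNEL file chose this
SetFallback(SDOTKERNEL dot.S)  # already defined -> keeps dot_custom.S
SetFallback(DDOTKERNEL dot.S)  # undefined -> falls back to dot.S
```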
macro(SetDefaultL1) - set(SAMAXKERNEL amax.S) - set(DAMAXKERNEL amax.S) - set(QAMAXKERNEL amax.S) - set(CAMAXKERNEL zamax.S) - set(ZAMAXKERNEL zamax.S) - set(XAMAXKERNEL zamax.S) - set(SAMINKERNEL amin.S) - set(DAMINKERNEL amin.S) - set(QAMINKERNEL amin.S) - set(CAMINKERNEL zamin.S) - set(ZAMINKERNEL zamin.S) - set(XAMINKERNEL zamin.S) - set(SMAXKERNEL max.S) - set(DMAXKERNEL max.S) - set(QMAXKERNEL max.S) - set(SMINKERNEL min.S) - set(DMINKERNEL min.S) - set(QMINKERNEL min.S) - set(ISAMAXKERNEL iamax.S) - set(IDAMAXKERNEL iamax.S) - set(IQAMAXKERNEL iamax.S) - set(ICAMAXKERNEL izamax.S) - set(IZAMAXKERNEL izamax.S) - set(IXAMAXKERNEL izamax.S) - set(ISAMINKERNEL iamin.S) - set(IDAMINKERNEL iamin.S) - set(IQAMINKERNEL iamin.S) - set(ICAMINKERNEL izamin.S) - set(IZAMINKERNEL izamin.S) - set(IXAMINKERNEL izamin.S) - set(ISMAXKERNEL iamax.S) - set(IDMAXKERNEL iamax.S) - set(IQMAXKERNEL iamax.S) - set(ISMINKERNEL iamin.S) - set(IDMINKERNEL iamin.S) - set(IQMINKERNEL iamin.S) - set(SASUMKERNEL asum.S) - set(DASUMKERNEL asum.S) - set(CASUMKERNEL zasum.S) - set(ZASUMKERNEL zasum.S) - set(QASUMKERNEL asum.S) - set(XASUMKERNEL zasum.S) - set(SAXPYKERNEL axpy.S) - set(DAXPYKERNEL axpy.S) - set(CAXPYKERNEL zaxpy.S) - set(ZAXPYKERNEL zaxpy.S) - set(QAXPYKERNEL axpy.S) - set(XAXPYKERNEL zaxpy.S) - set(SCOPYKERNEL copy.S) - set(DCOPYKERNEL copy.S) - set(CCOPYKERNEL zcopy.S) - set(ZCOPYKERNEL zcopy.S) - set(QCOPYKERNEL copy.S) - set(XCOPYKERNEL zcopy.S) - set(SDOTKERNEL dot.S) - set(DDOTKERNEL dot.S) - set(CDOTKERNEL zdot.S) - set(ZDOTKERNEL zdot.S) - set(QDOTKERNEL dot.S) - set(XDOTKERNEL zdot.S) - set(SNRM2KERNEL nrm2.S) - set(DNRM2KERNEL nrm2.S) - set(QNRM2KERNEL nrm2.S) - set(CNRM2KERNEL znrm2.S) - set(ZNRM2KERNEL znrm2.S) - set(XNRM2KERNEL znrm2.S) - set(SROTKERNEL rot.S) - set(DROTKERNEL rot.S) - set(QROTKERNEL rot.S) - set(CROTKERNEL zrot.S) - set(ZROTKERNEL zrot.S) - set(XROTKERNEL zrot.S) - set(SSCALKERNEL scal.S) - set(DSCALKERNEL scal.S) - set(CSCALKERNEL zscal.S) - set(ZSCALKERNEL zscal.S) - set(QSCALKERNEL scal.S) - set(XSCALKERNEL zscal.S) - set(SSWAPKERNEL swap.S) - set(DSWAPKERNEL swap.S) - set(CSWAPKERNEL zswap.S) - set(ZSWAPKERNEL zswap.S) - set(QSWAPKERNEL swap.S) - set(XSWAPKERNEL zswap.S) - set(SGEMVNKERNEL gemv_n.S) - set(SGEMVTKERNEL gemv_t.S) - set(DGEMVNKERNEL gemv_n.S) - set(DGEMVTKERNEL gemv_t.S) - set(CGEMVNKERNEL zgemv_n.S) - set(CGEMVTKERNEL zgemv_t.S) - set(ZGEMVNKERNEL zgemv_n.S) - set(ZGEMVTKERNEL zgemv_t.S) - set(QGEMVNKERNEL gemv_n.S) - set(QGEMVTKERNEL gemv_t.S) - set(XGEMVNKERNEL zgemv_n.S) - set(XGEMVTKERNEL zgemv_t.S) - set(SCABS_KERNEL ../generic/cabs.c) - set(DCABS_KERNEL ../generic/cabs.c) - set(QCABS_KERNEL ../generic/cabs.c) - set(LSAME_KERNEL ../generic/lsame.c) - set(SAXPBYKERNEL ../arm/axpby.c) - set(DAXPBYKERNEL ../arm/axpby.c) - set(CAXPBYKERNEL ../arm/zaxpby.c) - set(ZAXPBYKERNEL ../arm/zaxpby.c) - set(SSUMKERNEL sum.S) - set(DSUMKERNEL sum.S) - set(CSUMKERNEL zsum.S) - set(ZSUMKERNEL zsum.S) - set(QSUMKERNEL sum.S) - set(XSUMKERNEL zsum.S) + SetFallback(SAMAXKERNEL amax.S) + SetFallback(DAMAXKERNEL amax.S) + SetFallback(QAMAXKERNEL amax.S) + SetFallback(CAMAXKERNEL zamax.S) + SetFallback(ZAMAXKERNEL zamax.S) + SetFallback(XAMAXKERNEL zamax.S) + SetFallback(SAMINKERNEL amin.S) + SetFallback(DAMINKERNEL amin.S) + SetFallback(QAMINKERNEL amin.S) + SetFallback(CAMINKERNEL zamin.S) + SetFallback(ZAMINKERNEL zamin.S) + SetFallback(XAMINKERNEL zamin.S) + SetFallback(SMAXKERNEL max.S) + SetFallback(DMAXKERNEL max.S) + SetFallback(QMAXKERNEL max.S) + 
SetFallback(SMINKERNEL min.S) + SetFallback(DMINKERNEL min.S) + SetFallback(QMINKERNEL min.S) + SetFallback(ISAMAXKERNEL iamax.S) + SetFallback(IDAMAXKERNEL iamax.S) + SetFallback(IQAMAXKERNEL iamax.S) + SetFallback(ICAMAXKERNEL izamax.S) + SetFallback(IZAMAXKERNEL izamax.S) + SetFallback(IXAMAXKERNEL izamax.S) + SetFallback(ISAMINKERNEL iamin.S) + SetFallback(IDAMINKERNEL iamin.S) + SetFallback(IQAMINKERNEL iamin.S) + SetFallback(ICAMINKERNEL izamin.S) + SetFallback(IZAMINKERNEL izamin.S) + SetFallback(IXAMINKERNEL izamin.S) + SetFallback(ISMAXKERNEL iamax.S) + SetFallback(IDMAXKERNEL iamax.S) + SetFallback(IQMAXKERNEL iamax.S) + SetFallback(ISMINKERNEL iamin.S) + SetFallback(IDMINKERNEL iamin.S) + SetFallback(IQMINKERNEL iamin.S) + SetFallback(SASUMKERNEL asum.S) + SetFallback(DASUMKERNEL asum.S) + SetFallback(CASUMKERNEL zasum.S) + SetFallback(ZASUMKERNEL zasum.S) + SetFallback(QASUMKERNEL asum.S) + SetFallback(XASUMKERNEL zasum.S) + SetFallback(SAXPYKERNEL axpy.S) + SetFallback(DAXPYKERNEL axpy.S) + SetFallback(CAXPYKERNEL zaxpy.S) + SetFallback(ZAXPYKERNEL zaxpy.S) + SetFallback(QAXPYKERNEL axpy.S) + SetFallback(XAXPYKERNEL zaxpy.S) + SetFallback(SCOPYKERNEL copy.S) + SetFallback(DCOPYKERNEL copy.S) + SetFallback(CCOPYKERNEL zcopy.S) + SetFallback(ZCOPYKERNEL zcopy.S) + SetFallback(QCOPYKERNEL copy.S) + SetFallback(XCOPYKERNEL zcopy.S) + SetFallback(SDOTKERNEL dot.S) + SetFallback(DDOTKERNEL dot.S) + SetFallback(CDOTKERNEL zdot.S) + SetFallback(ZDOTKERNEL zdot.S) + SetFallback(QDOTKERNEL dot.S) + SetFallback(XDOTKERNEL zdot.S) + SetFallback(SNRM2KERNEL nrm2.S) + SetFallback(DNRM2KERNEL nrm2.S) + SetFallback(QNRM2KERNEL nrm2.S) + SetFallback(CNRM2KERNEL znrm2.S) + SetFallback(ZNRM2KERNEL znrm2.S) + SetFallback(XNRM2KERNEL znrm2.S) + SetFallback(SROTKERNEL rot.S) + SetFallback(DROTKERNEL rot.S) + SetFallback(QROTKERNEL rot.S) + SetFallback(CROTKERNEL zrot.S) + SetFallback(ZROTKERNEL zrot.S) + SetFallback(XROTKERNEL zrot.S) + SetFallback(SSCALKERNEL scal.S) + SetFallback(DSCALKERNEL scal.S) + SetFallback(CSCALKERNEL zscal.S) + SetFallback(ZSCALKERNEL zscal.S) + SetFallback(QSCALKERNEL scal.S) + SetFallback(XSCALKERNEL zscal.S) + SetFallback(SSWAPKERNEL swap.S) + SetFallback(DSWAPKERNEL swap.S) + SetFallback(CSWAPKERNEL zswap.S) + SetFallback(ZSWAPKERNEL zswap.S) + SetFallback(QSWAPKERNEL swap.S) + SetFallback(XSWAPKERNEL zswap.S) + SetFallback(SGEMVNKERNEL gemv_n.S) + SetFallback(SGEMVTKERNEL gemv_t.S) + SetFallback(DGEMVNKERNEL gemv_n.S) + SetFallback(DGEMVTKERNEL gemv_t.S) + SetFallback(CGEMVNKERNEL zgemv_n.S) + SetFallback(CGEMVTKERNEL zgemv_t.S) + SetFallback(ZGEMVNKERNEL zgemv_n.S) + SetFallback(ZGEMVTKERNEL zgemv_t.S) + SetFallback(QGEMVNKERNEL gemv_n.S) + SetFallback(QGEMVTKERNEL gemv_t.S) + SetFallback(XGEMVNKERNEL zgemv_n.S) + SetFallback(XGEMVTKERNEL zgemv_t.S) + SetFallback(SCABS_KERNEL ../generic/cabs.c) + SetFallback(DCABS_KERNEL ../generic/cabs.c) + SetFallback(QCABS_KERNEL ../generic/cabs.c) + SetFallback(LSAME_KERNEL ../generic/lsame.c) + SetFallback(SAXPBYKERNEL ../arm/axpby.c) + SetFallback(DAXPBYKERNEL ../arm/axpby.c) + SetFallback(CAXPBYKERNEL ../arm/zaxpby.c) + SetFallback(ZAXPBYKERNEL ../arm/zaxpby.c) + SetFallback(SSUMKERNEL sum.S) + SetFallback(DSUMKERNEL sum.S) + SetFallback(CSUMKERNEL zsum.S) + SetFallback(ZSUMKERNEL zsum.S) + SetFallback(QSUMKERNEL sum.S) + SetFallback(XSUMKERNEL zsum.S) if (BUILD_BFLOAT16) - set(SHAMINKERNEL ../arm/amin.c) - set(SHAMAXKERNEL ../arm/amax.c) - set(SHMAXKERNEL ../arm/max.c) - set(SHMINKERNEL ../arm/min.c) - set(ISHAMAXKERNEL 
../arm/iamax.c) - set(ISHAMINKERNEL ../arm/iamin.c) - set(ISHMAXKERNEL ../arm/imax.c) - set(ISHMINKERNEL ../arm/imin.c) - set(SHASUMKERNEL ../arm/asum.c) - set(SHAXPYKERNEL ../arm/axpy.c) - set(SHAXPBYKERNEL ../arm/axpby.c) - set(SHCOPYKERNEL ../arm/copy.c) - set(SBDOTKERNEL ../x86_64/sbdot.c) - set(SHROTKERNEL ../arm/rot.c) - set(SHSCALKERNEL ../arm/scal.c) - set(SHNRM2KERNEL ../arm/nrm2.c) - set(SHSUMKERNEL ../arm/sum.c) - set(SHSWAPKERNEL ../arm/swap.c) - set(TOBF16KERNEL ../x86_64/tobf16.c) - set(BF16TOKERNEL ../x86_64/bf16to.c) + SetFallback(SHAMINKERNEL ../arm/amin.c) + SetFallback(SHAMAXKERNEL ../arm/amax.c) + SetFallback(SHMAXKERNEL ../arm/max.c) + SetFallback(SHMINKERNEL ../arm/min.c) + SetFallback(ISHAMAXKERNEL ../arm/iamax.c) + SetFallback(ISHAMINKERNEL ../arm/iamin.c) + SetFallback(ISHMAXKERNEL ../arm/imax.c) + SetFallback(ISHMINKERNEL ../arm/imin.c) + SetFallback(SHASUMKERNEL ../arm/asum.c) + SetFallback(SHAXPYKERNEL ../arm/axpy.c) + SetFallback(SHAXPBYKERNEL ../arm/axpby.c) + SetFallback(SHCOPYKERNEL ../arm/copy.c) + SetFallback(SBDOTKERNEL ../x86_64/sbdot.c) + SetFallback(SHROTKERNEL ../arm/rot.c) + SetFallback(SHSCALKERNEL ../arm/scal.c) + SetFallback(SHNRM2KERNEL ../arm/nrm2.c) + SetFallback(SHSUMKERNEL ../arm/sum.c) + SetFallback(SHSWAPKERNEL ../arm/swap.c) + SetFallback(TOBF16KERNEL ../x86_64/tobf16.c) + SetFallback(BF16TOKERNEL ../x86_64/bf16to.c) + SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) + SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) endif () endmacro () macro(SetDefaultL2) - set(SGEMVNKERNEL ../arm/gemv_n.c) - set(SGEMVTKERNEL ../arm/gemv_t.c) - set(DGEMVNKERNEL gemv_n.S) - set(DGEMVTKERNEL gemv_t.S) - set(CGEMVNKERNEL zgemv_n.S) - set(CGEMVTKERNEL zgemv_t.S) - set(ZGEMVNKERNEL zgemv_n.S) - set(ZGEMVTKERNEL zgemv_t.S) - set(QGEMVNKERNEL gemv_n.S) - set(QGEMVTKERNEL gemv_t.S) - set(XGEMVNKERNEL zgemv_n.S) - set(XGEMVTKERNEL zgemv_t.S) - set(SGERKERNEL ../generic/ger.c) - set(DGERKERNEL ../generic/ger.c) - set(QGERKERNEL ../generic/ger.c) - set(CGERUKERNEL ../generic/zger.c) - set(CGERCKERNEL ../generic/zger.c) - set(ZGERUKERNEL ../generic/zger.c) - set(ZGERCKERNEL ../generic/zger.c) - set(XGERUKERNEL ../generic/zger.c) - set(XGERCKERNEL ../generic/zger.c) - set(SSYMV_U_KERNEL ../generic/symv_k.c) - set(SSYMV_L_KERNEL ../generic/symv_k.c) - set(DSYMV_U_KERNEL ../generic/symv_k.c) - set(DSYMV_L_KERNEL ../generic/symv_k.c) - set(QSYMV_U_KERNEL ../generic/symv_k.c) - set(QSYMV_L_KERNEL ../generic/symv_k.c) - set(CSYMV_U_KERNEL ../generic/zsymv_k.c) - set(CSYMV_L_KERNEL ../generic/zsymv_k.c) - set(ZSYMV_U_KERNEL ../generic/zsymv_k.c) - set(ZSYMV_L_KERNEL ../generic/zsymv_k.c) - set(XSYMV_U_KERNEL ../generic/zsymv_k.c) - set(XSYMV_L_KERNEL ../generic/zsymv_k.c) - set(CHEMV_U_KERNEL ../generic/zhemv_k.c) - set(CHEMV_L_KERNEL ../generic/zhemv_k.c) - set(CHEMV_V_KERNEL ../generic/zhemv_k.c) - set(CHEMV_M_KERNEL ../generic/zhemv_k.c) - set(ZHEMV_U_KERNEL ../generic/zhemv_k.c) - set(ZHEMV_L_KERNEL ../generic/zhemv_k.c) - set(ZHEMV_V_KERNEL ../generic/zhemv_k.c) - set(ZHEMV_M_KERNEL ../generic/zhemv_k.c) - set(XHEMV_U_KERNEL ../generic/zhemv_k.c) - set(XHEMV_L_KERNEL ../generic/zhemv_k.c) - set(XHEMV_V_KERNEL ../generic/zhemv_k.c) - set(XHEMV_M_KERNEL ../generic/zhemv_k.c) + SetFallback(SGEMVNKERNEL ../arm/gemv_n.c) + SetFallback(SGEMVTKERNEL ../arm/gemv_t.c) + SetFallback(DGEMVNKERNEL gemv_n.S) + SetFallback(DGEMVTKERNEL gemv_t.S) + SetFallback(CGEMVNKERNEL zgemv_n.S) + SetFallback(CGEMVTKERNEL zgemv_t.S) + SetFallback(ZGEMVNKERNEL zgemv_n.S) + SetFallback(ZGEMVTKERNEL 
zgemv_t.S) + SetFallback(QGEMVNKERNEL gemv_n.S) + SetFallback(QGEMVTKERNEL gemv_t.S) + SetFallback(XGEMVNKERNEL zgemv_n.S) + SetFallback(XGEMVTKERNEL zgemv_t.S) + SetFallback(SGERKERNEL ../generic/ger.c) + SetFallback(DGERKERNEL ../generic/ger.c) + SetFallback(QGERKERNEL ../generic/ger.c) + SetFallback(CGERUKERNEL ../generic/zger.c) + SetFallback(CGERCKERNEL ../generic/zger.c) + SetFallback(ZGERUKERNEL ../generic/zger.c) + SetFallback(ZGERCKERNEL ../generic/zger.c) + SetFallback(XGERUKERNEL ../generic/zger.c) + SetFallback(XGERCKERNEL ../generic/zger.c) + SetFallback(SSYMV_U_KERNEL ../generic/symv_k.c) + SetFallback(SSYMV_L_KERNEL ../generic/symv_k.c) + SetFallback(DSYMV_U_KERNEL ../generic/symv_k.c) + SetFallback(DSYMV_L_KERNEL ../generic/symv_k.c) + SetFallback(QSYMV_U_KERNEL ../generic/symv_k.c) + SetFallback(QSYMV_L_KERNEL ../generic/symv_k.c) + SetFallback(CSYMV_U_KERNEL ../generic/zsymv_k.c) + SetFallback(CSYMV_L_KERNEL ../generic/zsymv_k.c) + SetFallback(ZSYMV_U_KERNEL ../generic/zsymv_k.c) + SetFallback(ZSYMV_L_KERNEL ../generic/zsymv_k.c) + SetFallback(XSYMV_U_KERNEL ../generic/zsymv_k.c) + SetFallback(XSYMV_L_KERNEL ../generic/zsymv_k.c) + SetFallback(CHEMV_U_KERNEL ../generic/zhemv_k.c) + SetFallback(CHEMV_L_KERNEL ../generic/zhemv_k.c) + SetFallback(CHEMV_V_KERNEL ../generic/zhemv_k.c) + SetFallback(CHEMV_M_KERNEL ../generic/zhemv_k.c) + SetFallback(ZHEMV_U_KERNEL ../generic/zhemv_k.c) + SetFallback(ZHEMV_L_KERNEL ../generic/zhemv_k.c) + SetFallback(ZHEMV_V_KERNEL ../generic/zhemv_k.c) + SetFallback(ZHEMV_M_KERNEL ../generic/zhemv_k.c) + SetFallback(XHEMV_U_KERNEL ../generic/zhemv_k.c) + SetFallback(XHEMV_L_KERNEL ../generic/zhemv_k.c) + SetFallback(XHEMV_V_KERNEL ../generic/zhemv_k.c) + SetFallback(XHEMV_M_KERNEL ../generic/zhemv_k.c) if (BUILD_BFLOAT16) - set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) - set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) - set(SHGERKERNEL ../generic/ger.c) + SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) + SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) + SetFallback(SHGERKERNEL ../generic/ger.c) endif () endmacro () macro(SetDefaultL3) - set(SGEADD_KERNEL ../generic/geadd.c) - set(DGEADD_KERNEL ../generic/geadd.c) - set(CGEADD_KERNEL ../generic/zgeadd.c) - set(ZGEADD_KERNEL ../generic/zgeadd.c) + SetFallback(SGEADD_KERNEL ../generic/geadd.c) + SetFallback(DGEADD_KERNEL ../generic/geadd.c) + SetFallback(CGEADD_KERNEL ../generic/zgeadd.c) + SetFallback(ZGEADD_KERNEL ../generic/zgeadd.c) if (BUILD_BFLOAT16) - set(SHGEADD_KERNEL ../generic/geadd.c) - set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) - set(SBGEMM_BETA ../generic/gemm_beta.c) - set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) - set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) - set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) - set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) - set(SBGEMMINCOPYOBJ sbgemm_incopy.o) - set(SBGEMMITCOPYOBJ sbgemm_itcopy.o) - set(SBGEMMONCOPYOBJ sbgemm_oncopy.o) - set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) + SetFallback(SHGEADD_KERNEL ../generic/geadd.c) + SetFallback(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) + SetFallback(SBGEMM_BETA ../generic/gemm_beta.c) + SetFallback(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) + SetFallback(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) + SetFallback(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) + SetFallback(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) + SetFallback(SBGEMMINCOPYOBJ sbgemm_incopy.o) + SetFallback(SBGEMMITCOPYOBJ sbgemm_itcopy.o) + SetFallback(SBGEMMONCOPYOBJ sbgemm_oncopy.o) + SetFallback(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) endif () endmacro () diff --git 
a/cmake/lapack.cmake b/cmake/lapack.cmake index 73f2592ef..0e45d4c63 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -66,7 +66,7 @@ set(SLASRC slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f - slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f + slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f slarrv.f slartv.f slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f @@ -112,14 +112,14 @@ set(SLASRC sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f stpqrt.f stpqrt2.f stpmqrt.f stprfb.f sgelqt.f sgelqt3.f sgemlqt.f - sgetsls.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f + sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f sgelq.f slaswlq.f slamswlq.f sgemlq.f stplqt.f stplqt2.f stpmlqt.f ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f sgesvdq.f slaorhr_col_getrfnp.f - slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f ) + slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f ) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f @@ -171,7 +171,7 @@ set(CLASRC claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f - clarf.f clarfb.f clarfg.f clarfgp.f clarft.f + clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f clartv.f clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f @@ -209,14 +209,14 @@ set(CLASRC cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f cgelqt.f cgelqt3.f cgemlqt.f - cgetsls.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f + cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f cgelq.f claswlq.f clamswlq.f cgemlq.f ctplqt.f ctplqt2.f ctpmlqt.f chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f - cungtsqr.f cunhr_col.f ) + cungtsqr.f cungtsqr_row.f cunhr_col.f ) set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f @@ -253,7 +253,7 @@ set(DLASRC dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f - dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f + dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlargv.f dlarrv.f dlartv.f dlarz.f dlarzb.f dlarzt.f dlasy2.f dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f @@ -300,14 +300,14 @@ set(DLASRC dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f dgelqt.f dgelqt3.f dgemlqt.f - dgetsls.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f + dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f dgelq.f dlaswlq.f dlamswlq.f dgemlq.f dtplqt.f dtplqt2.f dtpmlqt.f dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f dcombssq.f dgesvdq.f 
dlaorhr_col_getrfnp.f - dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f ) + dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f ) set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f @@ -360,7 +360,7 @@ set(ZLASRC zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f - zlarcm.f zlarf.f zlarfb.f + zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f zlarfg.f zlarfgp.f zlarft.f zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f @@ -402,13 +402,13 @@ set(ZLASRC ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f ztplqt.f ztplqt2.f ztpmlqt.f zgelqt.f zgelqt3.f zgemlqt.f - zgetsls.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f + zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f zgelq.f zlaswlq.f zlamswlq.f zgemlq.f zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f - zungtsqr.f zunhr_col.f) + zungtsqr.f zungtsqr_row.f zunhr_col.f) set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index f10905c4d..340ea6d6c 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -114,6 +114,8 @@ set(CSRC lapacke_cgetrs_work.c lapacke_cgetsls.c lapacke_cgetsls_work.c + lapacke_cgetsqrhrt.c + lapacke_cgetsqrhrt_work.c lapacke_cggbak.c lapacke_cggbak_work.c lapacke_cggbal.c @@ -590,6 +592,8 @@ set(CSRC lapacke_cungrq_work.c lapacke_cungtr.c lapacke_cungtr_work.c + lapacke_cungtsqr_row.c + lapacke_cungtsqr_row_work.c lapacke_cunmbr.c lapacke_cunmbr_work.c lapacke_cunmhr.c @@ -735,6 +739,8 @@ set(DSRC lapacke_dgetrs_work.c lapacke_dgetsls.c lapacke_dgetsls_work.c + lapacke_dgetsqrhrt.c + lapacke_dgetsqrhrt_work.c lapacke_dggbak.c lapacke_dggbak_work.c lapacke_dggbal.c @@ -862,6 +868,8 @@ set(DSRC lapacke_dorgrq_work.c lapacke_dorgtr.c lapacke_dorgtr_work.c + lapacke_dorgtsqr_row.c + lapacke_dorgtsqr_row_work.c lapacke_dormbr.c lapacke_dormbr_work.c lapacke_dormhr.c @@ -1309,6 +1317,8 @@ set(SSRC lapacke_sgetrs_work.c lapacke_sgetsls.c lapacke_sgetsls_work.c + lapacke_sgetsqrhrt.c + lapacke_sgetsqrhrt_work.c lapacke_sggbak.c lapacke_sggbak_work.c lapacke_sggbal.c @@ -1435,6 +1445,8 @@ set(SSRC lapacke_sorgrq_work.c lapacke_sorgtr.c lapacke_sorgtr_work.c + lapacke_sorgtsqr_row.c + lapacke_sorgtsqr_row_work.c lapacke_sormbr.c lapacke_sormbr_work.c lapacke_sormhr.c @@ -1877,6 +1889,8 @@ set(ZSRC lapacke_zgetrs_work.c lapacke_zgetsls.c lapacke_zgetsls_work.c + lapacke_zgetsqrhrt.c + lapacke_zgetsqrhrt_work.c lapacke_zggbak.c lapacke_zggbak_work.c lapacke_zggbal.c @@ -2351,6 +2365,8 @@ set(ZSRC lapacke_zungrq_work.c lapacke_zungtr.c lapacke_zungtr_work.c + lapacke_zungtsqr_row.c + lapacke_zungtsqr_row_work.c lapacke_zunmbr.c lapacke_zunmbr_work.c lapacke_zunmhr.c @@ -2499,6 +2515,5 @@ foreach (Utils_FILE ${Utils_SRC}) endforeach () set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include") -configure_file("${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h" COPYONLY) include_directories(${lapacke_include_dir}) set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") diff --git 
a/cmake/prebuild.cmake b/cmake/prebuild.cmake index da7686c33..4ef0ce93a 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -127,6 +127,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define DLOCAL_BUFFER_SIZE\t16384\n" "#define CLOCAL_BUFFER_SIZE\t16384\n" "#define ZLOCAL_BUFFER_SIZE\t16384\n") + set(HAVE_SSE 1) + set(HAVE_SSE2 1) + set(HAVE_SSE3 1) + set(HAVE_SSSE3 1) set(SGEMM_UNROLL_M 8) set(SGEMM_UNROLL_N 4) set(DGEMM_UNROLL_M 4) @@ -177,7 +181,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) - elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53") + elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53" OR "${TCORE}" STREQUAL "CORTEXA55") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" @@ -237,6 +241,61 @@ endif () set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "NEOVERSEN1") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t4\n" + "#define L2_SIZE\t1048576\n\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "NEOVERSEV1") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t4\n" + "#define L2_SIZE\t1048576\n\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define HAVE_SVE\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "NEOVERSEN2") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" "#define L1_CODE_LINESIZE\t64\n" @@ -246,13 +305,14 @@ endif () "#define L1_DATA_ASSOCIATIVE\t2\n" "#define L2_SIZE\t1048576\n\n" "#define L2_LINESIZE\t64\n" - "#define L2_ASSOCIATIVE\t16\n" - "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" "#define DTB_SIZE\t4096\n" "#define HAVE_VFPV4\n" "#define HAVE_VFPV3\n" "#define HAVE_VFP\n" "#define HAVE_NEON\n" + "#define HAVE_SVE\n" "#define ARMV8\n") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) @@ -416,7 +476,7 @@ endif () set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) -elseif ("${TCORE}" STREQUAL "VORTEX") + elseif ("${TCORE}" STREQUAL "VORTEX") file(APPEND ${TARGET_CONF_TEMP} "#define ARMV8\n" "#define L1_CODE_SIZE\t32768\n" @@ -439,6 +499,34 @@ elseif ("${TCORE}" STREQUAL "VORTEX") set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) 
set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "P5600") + file(APPEND ${TARGET_CONF_TEMP} + "#define L2_SIZE 1048576\n" + "#define DTB_SIZE 4096\n" + "#define DTB_DEFAULT_ENTRIES 64\n") + set(SGEMM_UNROLL_M 2) + set(SGEMM_UNROLL_N 2) + set(DGEMM_UNROLL_M 2) + set(DGEMM_UNROLL_N 2) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 2) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 2) + set(SYMV_P 16) + elseif ("${TCORE}" MATCHES "MIPS") + file(APPEND ${TARGET_CONF_TEMP} + "#define L2_SIZE 262144\n" + "#define DTB_SIZE 4096\n" + "#define DTB_DEFAULT_ENTRIES 64\n") + set(SGEMM_UNROLL_M 2) + set(SGEMM_UNROLL_N 2) + set(DGEMM_UNROLL_M 2) + set(DGEMM_UNROLL_N 2) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 2) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 2) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "POWER6") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE 32768\n" diff --git a/cmake/system.cmake b/cmake/system.cmake index 66e95c6d3..e0e92bde7 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -33,15 +33,18 @@ endif () if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) message(STATUS "Compiling a ${BINARY}-bit binary.") set(NO_AVX 1) - if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE") + if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE" OR ${TARGET} STREQUAL "SAPPHIRERAPIDS") set(TARGET "NEHALEM") endif () if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") set(TARGET "BARCELONA") endif () - if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53") + if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55") set(TARGET "ARMV7") endif () + if (${TARGET} STREQUAL "POWER8" OR ${TARGET} STREQUAL "POWER9" OR ${TARGET} STREQUAL "POWER10") + set(TARGET "POWER6") + endif () endif () @@ -102,6 +105,18 @@ if (CMAKE_C_COMPILER STREQUAL loongcc) set(GETARCH_FLAGS "${GETARCH_FLAGS} -static") endif () +if (POWER) + set(NO_WARMUP 1) + set(HAVE_GAS 1) + if (CMAKE_ASM_COMPILER_ID STREQUAL "GNU") + set(HAVE_GAS 0) + elseif (CMAKE_ASM_COMPILER_ID STREQUAL "Clang") + set(CCOMMON_OPT "${CCOMMON_OPT} -fno-integrated-as") + set(HAVE_GAS 0) + endif () + set(GETARCH_FLAGS "${GETARCH_FLAGS} -DHAVE_GAS=${HAVE_GAS}") +endif () + #if don't use Fortran, it will only compile CBLAS. 
if (ONLY_CBLAS) set(NO_LAPACK 1) @@ -148,16 +163,36 @@ endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") if (DEFINED TARGET) if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) -# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") else() set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() -# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") -# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") -# endif() + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + endif() + endif() + if (${TARGET} STREQUAL SAPPHIRERAPIDS AND NOT NO_AVX512) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 11.0) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.0) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + endif() endif() if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") @@ -182,11 +217,11 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() endif() - if (DEFINED HAVE_FMA3) - if (NOT NO_AVX2) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") - endif() - endif() + # if (DEFINED HAVE_FMA3) + # if (NOT NO_AVX2) + # set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") + # endif() + # endif() if (DEFINED HAVE_SSE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") endif() @@ -202,6 +237,27 @@ if (DEFINED TARGET) if (DEFINED HAVE_SSE4_1) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") endif() + + if (${TARGET} STREQUAL POWER10) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") + else () + message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10.") + endif() + endif() + if (${TARGET} STREQUAL POWER9) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") + else () + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") + message(WARNING "Compiler GCC.${GCC_VERSION} does not support fully Power9.") + endif() + endif() + if (${TARGET} STREQUAL POWER8) + set (KERNEL_DEFINITIONS 
"${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") + endif() endif() if (DEFINED BINARY) message(STATUS "Compiling a ${BINARY}-bit binary.") @@ -219,6 +275,11 @@ include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake") # C Compiler dependent settings include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") +if (INTERFACE64) + set(SUFFIX64 64) + set(SUFFIX64_UNDERSCORE _64) +endif() + if (NOT NOFORTRAN) # Fortran Compiler dependent settings include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") @@ -233,6 +294,11 @@ if (BINARY64) endif () endif () +if(EMBEDDED) + set(CCOMMON_OPT "${CCOMMON_OPT} -DOS_EMBEDDED") + set(CCOMMON_OPT "${CCOMMON_OPT} -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16") +endif() + if (NEED_PIC) if (${CMAKE_C_COMPILER} STREQUAL "IBM") set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large") @@ -249,8 +315,15 @@ if (NEED_PIC) endif() endif () +if (X86_64 OR ${CORE} STREQUAL POWER10) + set(SMALL_MATRIX_OPT TRUE) +endif () +if (SMALL_MATRIX_OPT) + set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT") +endif () + if (DYNAMIC_ARCH) - if (X86 OR X86_64 OR ARM64 OR PPC) + if (X86 OR X86_64 OR ARM64 OR POWER) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") if (DYNAMIC_OLDER) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") @@ -290,6 +363,10 @@ if (NO_AVX2) set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2") endif () +if (NO_AVX512) + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") +endif () + if (USE_THREAD) # USE_SIMPLE_THREADED_LEVEL3 = 1 # NO_AFFINITY = 1 @@ -449,6 +526,9 @@ endif() if (BUILD_COMPLEX16) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16") endif() +if (BUILD_BFLOAT16) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_BFLOAT16") +endif() if(NOT MSVC) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") endif() diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index fdc79c8ce..86ce3dfb0 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -20,11 +20,11 @@ endif() -if(CMAKE_COMPILER_IS_GNUCC AND WIN32) +if(MINGW) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine - OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE + OUTPUT_VARIABLE OPENBLAS_MINGW_TARGET_MACHINE OUTPUT_STRIP_TRAILING_WHITESPACE) - if(OPENBLAS_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") + if(OPENBLAS_MINGW_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") set(MINGW64 1) endif() endif() @@ -35,9 +35,11 @@ if(CMAKE_CL_64 OR MINGW64) elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING)) set(X86 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") - set(PPC 1) + set(POWER 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") + set(LOONGARCH64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") if (NOT BINARY) if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") @@ -71,6 +73,8 @@ elseif (${CMAKE_CROSSCOMPILING}) else () set(X86 1) endif() + elseif (${TARGET} STREQUAL "P5600" OR ${TARGET} MATCHES "MIPS.*") + set(MIPS32 1) elseif (${TARGET} STREQUAL "ARMV7") set(ARM 1) else() @@ -84,8 +88,12 @@ if (X86_64) set(ARCH "x86_64") elseif(X86) set(ARCH "x86") -elseif(PPC) +elseif(POWER) set(ARCH "power") +elseif(MIPS32) + set(ARCH "mips") +elseif(MIPS64) + set(ARCH "mips64") elseif(ARM) set(ARCH "arm") elseif(ARM64) @@ -95,7 +103,7 @@ else() endif () if (NOT BINARY) - if (X86_64 OR ARM64 OR PPC OR MIPS64) + if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64) set(BINARY 64) else () set(BINARY 32) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 8f25c1b27..56c1cb060 100644 --- 
a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -15,35 +15,83 @@ endfunction () # Reads a Makefile into CMake vars. macro(ParseMakefileVars MAKEFILE_IN) message(STATUS "Reading vars from ${MAKEFILE_IN}...") - set (IfElse 0) - set (ElseSeen 0) + set (C_COMPILER ${CMAKE_C_COMPILER_ID}) + set (IfElse 0) + set (ElseSeen 0) + set (SkipIfs 0) + set (SkipElse 0) file(STRINGS ${MAKEFILE_IN} makefile_contents) foreach (makefile_line ${makefile_contents}) -#message(STATUS "parsing ${makefile_line}") + #message(STATUS "parsing ${makefile_line}") + # Skip the entire scope of the else statement given that the if statement that precedes it has the valid condition. + # The variable SkipIfs is used to identify which endif statement closes the scope of the else statement. + if (${SkipElse} EQUAL 1) + #message(STATUS "skipping ${makefile_line}") + string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + MATH(EXPR SkipIfs "${SkipIfs}+1") + endif () + string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + if (${SkipIfs} EQUAL 0) + set (SkipElse 0) + else () + MATH(EXPR SkipIfs "${SkipIfs}-1") + endif () + endif () + continue () + endif () + # The variable IfElse is greater than 0 if and only if the previously parsed line is an if statement. if (${IfElse} GREATER 0) + # If the current scope is the one that has to be skipped, the if/endif/else statements + # along with it till the endif that closes the current scope have to be ignored as well. + string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) + #message(STATUS "skipping ${makefile_line}") + MATH(EXPR SkipIfs "${SkipIfs}+1") + continue () + endif () + endif () string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") -# message(STATUS "ENDIF ${makefile_line}") - set (IfElse 0) - set (ElseSeen 0) + if (${SkipIfs} EQUAL 0) + #message(STATUS "ENDIF ${makefile_line}") + set (IfElse 0) + set (ElseSeen 0) + else () + #message(STATUS "skipping ${makefile_line}") + MATH(EXPR SkipIfs "${SkipIfs}-1") + endif () continue () endif () string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") - if (NOT "${line_match}" STREQUAL "") -# message(STATUS "ELSE ${makefile_line}") - set (ElseSeen 1) - continue () - endif() - if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) -# message(STATUS "skipping ${makefile_line}") - continue () + if (NOT "${line_match}" STREQUAL "") + if (${SkipIfs} EQUAL 0) + #message(STATUS "ELSE ${makefile_line}") + set (ElseSeen 1) + else () + #message(STATUS "skipping ${makefile_line}") + endif () + continue () + endif() + # Skip the lines that are not part of the path that has to be taken. 
+ if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1) OR (${SkipIfs} GREATER 0)) + #message(STATUS "skipping ${makefile_line}") + continue () endif () - endif () + endif () + # Skip commented lines (the ones that start with '#') + string(REGEX MATCH "[ \t]*\\#.*$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "skipping ${makefile_line}") + continue () + endif () string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") -#message(STATUS "match on ${line_match}") + #message(STATUS "match on ${line_match}") set(var_name ${CMAKE_MATCH_1}) -# set(var_value ${CMAKE_MATCH_2}) + #set(var_value ${CMAKE_MATCH_2}) string(STRIP ${CMAKE_MATCH_2} var_value) # check for Makefile variables in the string, e.g. $(TSUFFIX) string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) @@ -54,36 +102,93 @@ macro(ParseMakefileVars MAKEFILE_IN) string(REPLACE "$(${make_var})" "${${make_var}}" var_value ${var_value}) endforeach () set(${var_name} ${var_value}) - else () - string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") - if (NOT "${line_match}" STREQUAL "") -#message(STATUS "match on include ${line_match}") - ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) + continue () + endif () + # Include a new file to be parsed + string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "match on include ${line_match}") + ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) + continue () + endif () + # The if statement that precedes this else has the path taken + # Thus, this else statement has to be skipped. 
+ string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "skipping ${makefile_line}") + set (SkipElse 1) + continue() + endif() + # Example 1: ifdef HAVE_MSA + # Example 2: ifndef ZNRM2KERNEL + string(REGEX MATCH "(ifdef|ifndef) ([0-9_A-Z]+)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "${CMAKE_MATCH_1} first: ${CMAKE_MATCH_2}") + set (ElseSeen 0) + if (${CMAKE_MATCH_2}) + if (${CMAKE_MATCH_1} STREQUAL "ifdef") + #message (STATUS "condition is true") + set (IfElse 1) + else () + set (IfElse 2) + endif () else () -# message(STATUS "unmatched line ${line_match}") - string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") - if (NOT "${line_match}" STREQUAL "") -# message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") - if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}) -# message (STATUS "condition is true") - set (IfElse 1) - else () - set (IfElse 2) - endif () + if (${CMAKE_MATCH_1} STREQUAL "ifdef") + set (IfElse 2) else () - string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") - if (NOT "${line_match}" STREQUAL "") -# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") - if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) -# message (STATUS "condition is true") - set (IfElse 1) - else () - set (IfElse 2) - endif () - endif () + #message (STATUS "condition is true") + set (IfElse 1) + endif () + endif () + continue () + endif () + # Example 1: ifeq ($(SGEMM_UNROLL_M), 16) + # Example 2: ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) + # Example 3: ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2) + # Ignore the second group since (?:...) 
does not work on cmake + string(REGEX MATCH "ifeq \\(\\$\\(([0-9_A-Z]+)\\)(([0-9_A-Za-z]*)\\$\\(([0-9_A-Z]+)\\))?,[ \t]*([0-9_A-Za-z]+)\\)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4} fourth: ${CMAKE_MATCH_5}") + if (DEFINED ${CMAKE_MATCH_1}) + if (DEFINED ${CMAKE_MATCH_4}) + set (STR ${${CMAKE_MATCH_1}}${CMAKE_MATCH_3}${${CMAKE_MATCH_4}}) + else () + set (STR ${${CMAKE_MATCH_1}}) + endif () + if (${STR} STREQUAL ${CMAKE_MATCH_5}) + #message (STATUS "condition is true") + set (IfElse 1) + continue () endif () endif () + set (IfElse 2) + continue () endif () + # Example 1 (Group 3): ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) + # Example 2 (Group 4): ifneq ($(C_COMPILER), PGI) + string(REGEX MATCH "ifneq \\(\\$\\(([0-9_A-Z]+)\\),[ \t]*(\\$\\(([0-9_A-Z]+)\\)|([0-9_A-Z]+))\\)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4}") + set (ElseSeen 0) + set (HasValidGroup 0) + if (DEFINED ${CMAKE_MATCH_3}) + set (HasValidGroup 1) + set (STR ${${CMAKE_MATCH_3}}) + elseif (NOT ${CMAKE_MATCH_4} STREQUAL "") + set (HasValidGroup 1) + set (STR ${CMAKE_MATCH_4}) + endif () + if (DEFINED ${CMAKE_MATCH_1} AND ${HasValidGroup} EQUAL 1) + if (NOT (${${CMAKE_MATCH_1}} STREQUAL ${STR})) + #message (STATUS "condition is true") + set (IfElse 1) + continue () + endif () + endif () + set (IfElse 2) + continue () + endif () + #message(STATUS "unmatched line ${line_match}") endforeach () endmacro () @@ -154,31 +259,31 @@ endfunction () # STRING - compiles only the given type (e.g. DOUBLE) function(GenerateNamedObjects sources_in) - if (DEFINED ARGV1) + if (${ARGC} GREATER 1) set(defines_in ${ARGV1}) endif () - if (DEFINED ARGV2 AND NOT "${ARGV2}" STREQUAL "") + if (${ARGC} GREATER 2 AND NOT "${ARGV2}" STREQUAL "") set(name_in ${ARGV2}) # strip off extension for kernel files that pass in the object name. 
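The extended ifeq handling above now also matches concatenated conditions such as ifeq ($(A)x$(B), 8x8), of the kind the example comments quote for the GEMM unroll factors. A small standalone check of that regex and of the comparison that follows it; the sample line and the 8x8 values are only illustrative.

    set(SGEMM_UNROLL_M 8)
    set(SGEMM_UNROLL_N 8)
    set(makefile_line "ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)")
    string(REGEX MATCH "ifeq \\(\\$\\(([0-9_A-Z]+)\\)(([0-9_A-Za-z]*)\\$\\(([0-9_A-Z]+)\\))?,[ \t]*([0-9_A-Za-z]+)\\)" line_match "${makefile_line}")
    # CMAKE_MATCH_1 = SGEMM_UNROLL_M, CMAKE_MATCH_3 = x,
    # CMAKE_MATCH_4 = SGEMM_UNROLL_N, CMAKE_MATCH_5 = 8x8
    set(STR ${${CMAKE_MATCH_1}}${CMAKE_MATCH_3}${${CMAKE_MATCH_4}})
    if (${STR} STREQUAL ${CMAKE_MATCH_5})
      message(STATUS "condition is true")   # printed for these sample values
    endif ()
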
get_filename_component(name_in ${name_in} NAME_WE) endif () - if (DEFINED ARGV3) + if (${ARGC} GREATER 3) set(use_cblas ${ARGV3}) else () set(use_cblas false) endif () - if (DEFINED ARGV4) + if (${ARGC} GREATER 4) set(replace_last_with ${ARGV4}) endif () - if (DEFINED ARGV5) + if (${ARGC} GREATER 5) set(append_with ${ARGV5}) endif () - if (DEFINED ARGV6) + if (${ARGC} GREATER 6) set(no_float_type ${ARGV6}) else () set(no_float_type false) @@ -193,7 +298,7 @@ function(GenerateNamedObjects sources_in) set(real_only false) set(complex_only false) set(mangle_complex_sources false) - if (DEFINED ARGV7 AND NOT "${ARGV7}" STREQUAL "") + if (${ARGC} GREATER 7 AND NOT "${ARGV7}" STREQUAL "") if (${ARGV7} EQUAL 1) set(real_only true) elseif (${ARGV7} EQUAL 2) @@ -251,6 +356,19 @@ function(GenerateNamedObjects sources_in) # now add the object and set the defines set(obj_defines ${defines_in}) + list(FIND obj_defines "RC" def_idx) + if (${def_idx} GREATER -1) + # list(REMOVE_AT ${obj_defines} ${def_idx}) + list (REMOVE_ITEM obj_defines "RC") + list(APPEND obj_defines "RC=RC") + endif () + list(FIND obj_defines "CR" def_idx) + if (${def_idx} GREATER -1) + # list(REMOVE_AT ${obj_defines} ${def_idx}) + list (REMOVE_ITEM obj_defines "CR") + list(APPEND obj_defines "CR=CR") + endif () + if (use_cblas) set(obj_name "cblas_${obj_name}") list(APPEND obj_defines "CBLAS") @@ -295,7 +413,15 @@ function(GenerateNamedObjects sources_in) configure_file(${new_source_file}.tmp ${new_source_file} COPYONLY) file(REMOVE ${new_source_file}.tmp) list(APPEND SRC_LIST_OUT ${new_source_file}) - + message (STATUS ${new_source_file}) + if (DEFINED HAVE_FMA3) + if ( ${new_source_file} MATCHES "(s|d?)rot_k.*c") + set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") + endif () + if ( ${new_source_file} MATCHES "dgemv_t_k.*c") + set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") + endif () + endif () endforeach () endforeach () @@ -318,17 +444,17 @@ endfunction () function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_defines_in replace_scheme) set(alternate_name_in "") - if (DEFINED ARGV5) + if (${ARGC} GREATER 5) set(alternate_name_in ${ARGV5}) endif () set(no_float_type false) - if (DEFINED ARGV6) + if (${ARGC} GREATER 6) set(no_float_type ${ARGV6}) endif () set(complex_filename_scheme "") - if (DEFINED ARGV7) + if (${ARGC} GREATER 7) set(complex_filename_scheme ${ARGV7}) endif () diff --git a/common.h b/common.h index 2825407cb..00d1d0baf 100644 --- a/common.h +++ b/common.h @@ -122,7 +122,7 @@ extern "C" { #define ATOM GOTO_ATOM #undef GOTO_ATOM #endif -#else +#elif !defined(OS_EMBEDDED) #include #ifndef NO_SYSV_IPC #include @@ -134,6 +134,9 @@ extern "C" { #if defined(SMP) || defined(USE_LOCKING) #include #endif +#else +#include +#include #endif #if defined(OS_SUNOS) @@ -413,6 +416,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_alpha.h" #endif +#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include) +#if __has_include() +#include +#endif +#endif +#ifndef _CET_ENDBR +#define _CET_ENDBR +#endif + #ifdef ARCH_X86 #include "common_x86.h" #endif @@ -437,7 +449,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips.h" #endif - + #ifdef ARCH_RISCV64 #include "common_riscv64.h" #endif @@ -458,6 +470,14 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_zarch.h" #endif +#ifdef ARCH_LOONGARCH64 +#include "common_loongarch64.h" +#endif + 
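The GenerateNamedObjects/GenerateCombinationObjects changes in cmake/utils.cmake above swap the optional-argument checks from DEFINED ARGVn to ${ARGC} GREATER n, presumably because ARGVn values can leak in from an enclosing function call and make DEFINED report true for arguments that were never passed. A minimal sketch of that pitfall, with purely illustrative function names:

    function(inner)
      if (${ARGC} GREATER 0)
        message(STATUS "inner received: ${ARGV0}")
      else ()
        # Without the ARGC guard, DEFINED ARGV0 (and ${ARGV0}) may still
        # reflect the enclosing call's argument here.
        message(STATUS "inner received no arguments, yet ARGV0='${ARGV0}'")
      endif ()
    endfunction()

    function(outer)
      inner()                  # deliberately called with no arguments
    endfunction()

    outer(stale-value)         # typically prints: inner received no arguments, yet ARGV0='stale-value'
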
+#ifdef ARCH_E2K +#include "common_e2k.h" +#endif + #ifndef ASSEMBLER #ifdef OS_WINDOWSSTORE typedef char env_var_t[MAX_PATH]; @@ -488,10 +508,12 @@ static inline unsigned long long rpcc(void){ struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec; -#else +#elif !defined(OS_EMBEDDED) struct timeval tv; gettimeofday(&tv,NULL); return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000; +#else + return 0; #endif } #define RPCC_DEFINED @@ -521,6 +543,10 @@ static void __inline blas_lock(volatile BLASULONG *address){ #include "common_linux.h" #endif +#ifdef OS_EMBEDDED +#define DTB_DEFAULT_ENTRIES 64 +#endif + #define MMAP_ACCESS (PROT_READ | PROT_WRITE) #ifdef __NetBSD__ diff --git a/common_arm64.h b/common_arm64.h index 9cdded305..029e23886 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define INLINE inline -#ifdef F_INTERFACE_FLANG +#if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI) #define RETURN_BY_STACK #else #define RETURN_BY_COMPLEX @@ -120,7 +120,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ .text ; .p2align 2 ; .global REALNAME ; -#ifndef __APPLE__ +#if !defined(__APPLE__) && !defined(_WIN32) .type REALNAME, %function ; #endif REALNAME: diff --git a/common_c.h b/common_c.h index 40ecf5b8b..6cff610bb 100644 --- a/common_c.h +++ b/common_c.h @@ -232,6 +232,8 @@ #define CGEADD_K cgeadd_k +#define CGEMM_SMALL_MATRIX_PERMIT cgemm_small_matrix_permit + #else #define CAMAX_K gotoblas -> camax_k @@ -426,8 +428,51 @@ #define CGEADD_K gotoblas -> cgeadd_k +#define CGEMM_SMALL_MATRIX_PERMIT gotoblas -> cgemm_small_matrix_permit + #endif +#define CGEMM_SMALL_KERNEL_NN FUNC_OFFSET(cgemm_small_kernel_nn) +#define CGEMM_SMALL_KERNEL_NT FUNC_OFFSET(cgemm_small_kernel_nt) +#define CGEMM_SMALL_KERNEL_NR FUNC_OFFSET(cgemm_small_kernel_nr) +#define CGEMM_SMALL_KERNEL_NC FUNC_OFFSET(cgemm_small_kernel_nc) + +#define CGEMM_SMALL_KERNEL_TN FUNC_OFFSET(cgemm_small_kernel_tn) +#define CGEMM_SMALL_KERNEL_TT FUNC_OFFSET(cgemm_small_kernel_tt) +#define CGEMM_SMALL_KERNEL_TR FUNC_OFFSET(cgemm_small_kernel_tr) +#define CGEMM_SMALL_KERNEL_TC FUNC_OFFSET(cgemm_small_kernel_tc) + +#define CGEMM_SMALL_KERNEL_RN FUNC_OFFSET(cgemm_small_kernel_rn) +#define CGEMM_SMALL_KERNEL_RT FUNC_OFFSET(cgemm_small_kernel_rt) +#define CGEMM_SMALL_KERNEL_RR FUNC_OFFSET(cgemm_small_kernel_rr) +#define CGEMM_SMALL_KERNEL_RC FUNC_OFFSET(cgemm_small_kernel_rc) + +#define CGEMM_SMALL_KERNEL_CN FUNC_OFFSET(cgemm_small_kernel_cn) +#define CGEMM_SMALL_KERNEL_CT FUNC_OFFSET(cgemm_small_kernel_ct) +#define CGEMM_SMALL_KERNEL_CR FUNC_OFFSET(cgemm_small_kernel_cr) +#define CGEMM_SMALL_KERNEL_CC FUNC_OFFSET(cgemm_small_kernel_cc) + +#define CGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(cgemm_small_kernel_b0_nn) +#define CGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(cgemm_small_kernel_b0_nt) +#define CGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(cgemm_small_kernel_b0_nr) +#define CGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(cgemm_small_kernel_b0_nc) + +#define CGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(cgemm_small_kernel_b0_tn) +#define CGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(cgemm_small_kernel_b0_tt) +#define CGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(cgemm_small_kernel_b0_tr) +#define CGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(cgemm_small_kernel_b0_tc) + +#define CGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(cgemm_small_kernel_b0_rn) +#define CGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(cgemm_small_kernel_b0_rt) 
+#define CGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(cgemm_small_kernel_b0_rr) +#define CGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(cgemm_small_kernel_b0_rc) + +#define CGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(cgemm_small_kernel_b0_cn) +#define CGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(cgemm_small_kernel_b0_ct) +#define CGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(cgemm_small_kernel_b0_cr) +#define CGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(cgemm_small_kernel_b0_cc) + + #define CGEMM_NN cgemm_nn #define CGEMM_CN cgemm_cn #define CGEMM_TN cgemm_tn diff --git a/common_d.h b/common_d.h index 94dc3eea8..6f4bb2ded 100644 --- a/common_d.h +++ b/common_d.h @@ -157,6 +157,8 @@ #define DIMATCOPY_K_RT dimatcopy_k_rt #define DGEADD_K dgeadd_k +#define DGEMM_SMALL_MATRIX_PERMIT dgemm_small_matrix_permit + #else #define DAMAX_K gotoblas -> damax_k @@ -281,8 +283,21 @@ #define DGEADD_K gotoblas -> dgeadd_k +#define DGEMM_SMALL_MATRIX_PERMIT gotoblas -> dgemm_small_matrix_permit + #endif +#define DGEMM_SMALL_KERNEL_NN FUNC_OFFSET(dgemm_small_kernel_nn) +#define DGEMM_SMALL_KERNEL_NT FUNC_OFFSET(dgemm_small_kernel_nt) +#define DGEMM_SMALL_KERNEL_TN FUNC_OFFSET(dgemm_small_kernel_tn) +#define DGEMM_SMALL_KERNEL_TT FUNC_OFFSET(dgemm_small_kernel_tt) + +#define DGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(dgemm_small_kernel_b0_nn) +#define DGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(dgemm_small_kernel_b0_nt) +#define DGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(dgemm_small_kernel_b0_tn) +#define DGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(dgemm_small_kernel_b0_tt) + + #define DGEMM_NN dgemm_nn #define DGEMM_CN dgemm_tn #define DGEMM_TN dgemm_tn diff --git a/common_e2k.h b/common_e2k.h new file mode 100644 index 000000000..0739c9473 --- /dev/null +++ b/common_e2k.h @@ -0,0 +1,64 @@ +/***************************************************************************** +Copyright (c) 2011-2016, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#ifndef COMMON_E2K +#define COMMON_E2K + +#ifdef ASSEMBLER +#error +#endif + +#define MB do { __asm__ __volatile__("": : :"memory"); } while (0) +#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) +#define RMB + +#define INLINE __attribute__((__always_inline__)) inline + +static inline int blas_quickdivide(blasint x, blasint y) { + return x / y; +} + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 2 << 20) + +#ifndef BUFFERSIZE +#define BUFFER_SIZE (32 << 20) +#else +#define BUFFER_SIZE (32 << BUFFERSIZE) +#endif + +#define SEEK_ADDRESS + +#endif + diff --git a/common_interface.h b/common_interface.h index b9ebb2772..318827920 100644 --- a/common_interface.h +++ b/common_interface.h @@ -709,6 +709,13 @@ int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(spotri)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dpotri)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qpotri)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(cpotri)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zpotri)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xpotri)(char *, blasint *, xdouble *, blasint *, blasint *); + int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *); int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *); int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); diff --git a/common_level3.h b/common_level3.h index c4f9435a9..5080ada10 100644 --- a/common_level3.h +++ b/common_level3.h @@ -515,6 +515,129 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); #endif +#ifdef SMALL_MATRIX_OPT +int sbgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + +int sbgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sbgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sbgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sbgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + +int sgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + +int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, 
BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + +int dgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta); + +int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + +int sbgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int sgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int dgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int cgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1); + +int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float 
alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int zgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1); + +int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, 
BLASLONG ldc); + +int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, 
float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int 
zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +#endif + int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); diff --git a/common_linux.h b/common_linux.h index 35f3fb658..5a1c4e150 100644 --- a/common_linux.h +++ b/common_linux.h @@ -75,18 +75,10 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 return 0; #else -#if defined (LOONGSON3B) -#if defined (__64BIT__) - return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); -#else - return 0; //NULL Implementation on Loongson 3B 32bit. -#endif -#else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 // unsigned long null_nodemask=0; return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); #endif -#endif } static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { diff --git a/common_loongarch64.h b/common_loongarch64.h new file mode 100644 index 000000000..e15539b5f --- /dev/null +++ b/common_loongarch64.h @@ -0,0 +1,199 @@ +/***************************************************************************** +Copyright (c) 2011-2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef COMMON_LOONGARCH64 +#define COMMON_LOONGARCH64 + +#define MB __sync_synchronize() +#define WMB __sync_synchronize() +#define RMB __sync_synchronize() + +#define INLINE inline + +#ifndef ASSEMBLER + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#ifdef DOUBLE +#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#else + +#ifdef DOUBLE +#define LD fld.d +#define ST fst.d +#define MADD fmadd.d +#define NMADD fnmadd.d +#define MSUB fmsub.d +#define NMSUB fnmsub.d +#define ADD fadd.d +#define SUB fsub.d +#define MUL fmul.d +#define MOV fmov.d +#define CMOVT fsel +#define MTC movgr2fr.d +#define FABS fabs.d +#define CMPEQ fcmp.ceq.d +#define CMPLE fcmp.cle.d +#define CMPLT fcmp.clt.d +#define NEG fneg.d +#else +#define LD fld.s +#define ST fst.s +#define MADD fmadd.s +#define NMADD fnmadd.s +#define MSUB fmsub.s +#define NMSUB fnmsub.s +#define ADD fadd.s +#define SUB fsub.s +#define MUL fmul.s +#define MOV fmov.s +#define CMOVT fsel +#define MTC movgr2fr.w +#define FABS fabs.s +#define CMPEQ fcmp.ceq.s +#define CMPLE fcmp.cle.s +#define CMPLT fcmp.clt.s +#define NEG fneg.s +#endif /* defined(DOUBLE) */ + +#if defined(__64BIT__) && defined(USE64BITINT) +#define LDINT ld.d +#define LDARG ld.d +#define SDARG st.d +#elif defined(__64BIT__) && !defined(USE64BITINT) +#define LDINT ld.w +#define LDARG ld.d +#define SDARG st.d +#else +#define LDINT ld.w +#define LDARG ld.w +#define SDARG st.w +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif /* defined(F_INTERFACE) */ + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#define PROLOGUE \ + .text ;\ + .align 5 ;\ + .globl REALNAME ;\ + .type REALNAME, @function ;\ +REALNAME: ;\ + +#if defined(__linux__) && defined(__ELF__) +#define GNUSTACK .section .note.GNU-stack,"",@progbits +#else +#define GNUSTACK +#endif /* defined(__linux__) && defined(__ELF__) */ + +#define EPILOGUE \ + .end REALNAME ;\ + GNUSTACK + +#define PROFCODE + +#define MOVT(dst, src, cc) \ + bceqz cc, 1f; \ + add.d dst, src, $r0; \ + 1: + +#endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */ + +#endif /* defined(ASSEMBLER) */ + +#define SEEK_ADDRESS + +#define BUFFER_SIZE ( 32 << 20) + +#define PAGESIZE (16UL << 10) +#define FIXED_PAGESIZE (16UL << 10) +#define HUGE_PAGESIZE ( 2 << 20) + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +#endif diff --git a/common_macro.h b/common_macro.h index c6ea1bfd9..9826f1809 100644 --- a/common_macro.h +++ b/common_macro.h @@ -644,6 +644,17 @@ #define GEADD_K DGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT DGEMM_SMALL_MATRIX_PERMIT + +#define GEMM_SMALL_KERNEL_NN DGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_B0_NN DGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT DGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN DGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT DGEMM_SMALL_KERNEL_B0_TT + #elif defined(BFLOAT16) #define D_TO_BF16_K SBDTOBF16_K @@ -931,6 +942,18 @@ #define GEADD_K SGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT SBGEMM_SMALL_MATRIX_PERMIT + 
+#define GEMM_SMALL_KERNEL_NN SBGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT SBGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN SBGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT SBGEMM_SMALL_KERNEL_TT + +#define GEMM_SMALL_KERNEL_B0_NN SBGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT SBGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN SBGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT SBGEMM_SMALL_KERNEL_B0_TT + #endif #else @@ -1236,6 +1259,19 @@ #define IMATCOPY_K_RT SIMATCOPY_K_RT #define GEADD_K SGEADD_K + +#define GEMM_SMALL_MATRIX_PERMIT SGEMM_SMALL_MATRIX_PERMIT + +#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT + +#define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT + #endif #else #ifdef XDOUBLE @@ -2063,6 +2099,48 @@ #define GEADD_K ZGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT ZGEMM_SMALL_MATRIX_PERMIT + +#define GEMM_SMALL_KERNEL_NN ZGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT ZGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_NR ZGEMM_SMALL_KERNEL_NR +#define GEMM_SMALL_KERNEL_NC ZGEMM_SMALL_KERNEL_NC + +#define GEMM_SMALL_KERNEL_TN ZGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT ZGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_TR ZGEMM_SMALL_KERNEL_TR +#define GEMM_SMALL_KERNEL_TC ZGEMM_SMALL_KERNEL_TC + +#define GEMM_SMALL_KERNEL_RN ZGEMM_SMALL_KERNEL_RN +#define GEMM_SMALL_KERNEL_RT ZGEMM_SMALL_KERNEL_RT +#define GEMM_SMALL_KERNEL_RR ZGEMM_SMALL_KERNEL_RR +#define GEMM_SMALL_KERNEL_RC ZGEMM_SMALL_KERNEL_RC + +#define GEMM_SMALL_KERNEL_CN ZGEMM_SMALL_KERNEL_CN +#define GEMM_SMALL_KERNEL_CT ZGEMM_SMALL_KERNEL_CT +#define GEMM_SMALL_KERNEL_CR ZGEMM_SMALL_KERNEL_CR +#define GEMM_SMALL_KERNEL_CC ZGEMM_SMALL_KERNEL_CC + +#define GEMM_SMALL_KERNEL_B0_NN ZGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT ZGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_NR ZGEMM_SMALL_KERNEL_B0_NR +#define GEMM_SMALL_KERNEL_B0_NC ZGEMM_SMALL_KERNEL_B0_NC + +#define GEMM_SMALL_KERNEL_B0_TN ZGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT ZGEMM_SMALL_KERNEL_B0_TT +#define GEMM_SMALL_KERNEL_B0_TR ZGEMM_SMALL_KERNEL_B0_TR +#define GEMM_SMALL_KERNEL_B0_TC ZGEMM_SMALL_KERNEL_B0_TC + +#define GEMM_SMALL_KERNEL_B0_RN ZGEMM_SMALL_KERNEL_B0_RN +#define GEMM_SMALL_KERNEL_B0_RT ZGEMM_SMALL_KERNEL_B0_RT +#define GEMM_SMALL_KERNEL_B0_RR ZGEMM_SMALL_KERNEL_B0_RR +#define GEMM_SMALL_KERNEL_B0_RC ZGEMM_SMALL_KERNEL_B0_RC + +#define GEMM_SMALL_KERNEL_B0_CN ZGEMM_SMALL_KERNEL_B0_CN +#define GEMM_SMALL_KERNEL_B0_CT ZGEMM_SMALL_KERNEL_B0_CT +#define GEMM_SMALL_KERNEL_B0_CR ZGEMM_SMALL_KERNEL_B0_CR +#define GEMM_SMALL_KERNEL_B0_CC ZGEMM_SMALL_KERNEL_B0_CC + #else #define AMAX_K CAMAX_K @@ -2486,11 +2564,54 @@ #define GEADD_K CGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT CGEMM_SMALL_MATRIX_PERMIT + +#define GEMM_SMALL_KERNEL_NN CGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT CGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_NR CGEMM_SMALL_KERNEL_NR +#define GEMM_SMALL_KERNEL_NC CGEMM_SMALL_KERNEL_NC + +#define GEMM_SMALL_KERNEL_TN CGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT CGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_TR CGEMM_SMALL_KERNEL_TR +#define GEMM_SMALL_KERNEL_TC CGEMM_SMALL_KERNEL_TC + +#define 
GEMM_SMALL_KERNEL_RN CGEMM_SMALL_KERNEL_RN +#define GEMM_SMALL_KERNEL_RT CGEMM_SMALL_KERNEL_RT +#define GEMM_SMALL_KERNEL_RR CGEMM_SMALL_KERNEL_RR +#define GEMM_SMALL_KERNEL_RC CGEMM_SMALL_KERNEL_RC + +#define GEMM_SMALL_KERNEL_CN CGEMM_SMALL_KERNEL_CN +#define GEMM_SMALL_KERNEL_CT CGEMM_SMALL_KERNEL_CT +#define GEMM_SMALL_KERNEL_CR CGEMM_SMALL_KERNEL_CR +#define GEMM_SMALL_KERNEL_CC CGEMM_SMALL_KERNEL_CC + +#define GEMM_SMALL_KERNEL_B0_NN CGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT CGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_NR CGEMM_SMALL_KERNEL_B0_NR +#define GEMM_SMALL_KERNEL_B0_NC CGEMM_SMALL_KERNEL_B0_NC + +#define GEMM_SMALL_KERNEL_B0_TN CGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT CGEMM_SMALL_KERNEL_B0_TT +#define GEMM_SMALL_KERNEL_B0_TR CGEMM_SMALL_KERNEL_B0_TR +#define GEMM_SMALL_KERNEL_B0_TC CGEMM_SMALL_KERNEL_B0_TC + +#define GEMM_SMALL_KERNEL_B0_RN CGEMM_SMALL_KERNEL_B0_RN +#define GEMM_SMALL_KERNEL_B0_RT CGEMM_SMALL_KERNEL_B0_RT +#define GEMM_SMALL_KERNEL_B0_RR CGEMM_SMALL_KERNEL_B0_RR +#define GEMM_SMALL_KERNEL_B0_RC CGEMM_SMALL_KERNEL_B0_RC + +#define GEMM_SMALL_KERNEL_B0_CN CGEMM_SMALL_KERNEL_B0_CN +#define GEMM_SMALL_KERNEL_B0_CT CGEMM_SMALL_KERNEL_B0_CT +#define GEMM_SMALL_KERNEL_B0_CR CGEMM_SMALL_KERNEL_B0_CR +#define GEMM_SMALL_KERNEL_B0_CC CGEMM_SMALL_KERNEL_B0_CC + #endif #endif #ifndef ASSEMBLER -#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ +|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; extern BLASLONG sbgemm_p; diff --git a/common_mips64.h b/common_mips64.h index a06edfe08..287459e7d 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -229,12 +229,7 @@ REALNAME: ;\ #define BUFFER_SIZE ( 32 << 21) -#if defined(LOONGSON3A) -#define PAGESIZE (16UL << 10) -#define FIXED_PAGESIZE (16UL << 10) -#endif - -#if defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10) #endif @@ -250,7 +245,7 @@ REALNAME: ;\ #define MAP_ANONYMOUS MAP_ANON #endif -#if defined(LOONGSON3A) || defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PREFETCHD_(x) ld $0, x #define PREFETCHD(x) PREFETCHD_(x) #else diff --git a/common_param.h b/common_param.h index 3e3ae06f8..31fba9059 100644 --- a/common_param.h +++ b/common_param.h @@ -145,6 +145,19 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); int (*sbneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sblaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +#ifdef SMALL_MATRIX_OPT + int (*sbgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + + int (*sbgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, 
BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + + int (*sbgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +#endif #endif #if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) @@ -207,6 +220,20 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); #endif #ifdef BUILD_SINGLE +#ifdef SMALL_MATRIX_OPT + int (*sgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + + int (*sgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + + int (*sgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +#endif + int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -314,6 +341,19 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); #endif #ifdef BUILD_DOUBLE +#ifdef SMALL_MATRIX_OPT + int (*dgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta); + + int (*dgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, 
BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + + int (*dgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +#endif int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -513,6 +553,50 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); +#ifdef SMALL_MATRIX_OPT + int (*cgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1); + + int (*cgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, 
float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, 
BLASLONG ldc); + int (*cgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +#endif + int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -679,6 +763,50 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); +#ifdef SMALL_MATRIX_OPT + int (*zgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1); + + int (*zgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + 
int (*zgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + 
int (*zgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +#endif + int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -1069,6 +1197,8 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); extern gotoblas_t *gotoblas; +#define FUNC_OFFSET(func) (size_t)(&((gotoblas_t *)NULL)->func) + #define DTB_ENTRIES gotoblas -> dtb_entries #define GEMM_OFFSET_A gotoblas -> offsetA #define GEMM_OFFSET_B gotoblas -> offsetB @@ -1174,6 +1304,8 @@ extern gotoblas_t *gotoblas; #else +#define FUNC_OFFSET(func) (size_t)(func) + #define DTB_ENTRIES DTB_DEFAULT_ENTRIES #define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A diff --git a/common_s.h b/common_s.h index 34903ec49..fdd80b62f 100644 --- a/common_s.h +++ b/common_s.h @@ -164,6 +164,8 @@ #define SGEADD_K sgeadd_k +#define SGEMM_SMALL_MATRIX_PERMIT sgemm_small_matrix_permit + #else #define SAMAX_K gotoblas -> samax_k @@ -299,8 +301,21 @@ #define SGEADD_K gotoblas -> sgeadd_k +#define SGEMM_SMALL_MATRIX_PERMIT gotoblas -> sgemm_small_matrix_permit + #endif +#define SGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sgemm_small_kernel_nn) +#define SGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sgemm_small_kernel_nt) +#define SGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sgemm_small_kernel_tn) +#define SGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sgemm_small_kernel_tt) + +#define SGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sgemm_small_kernel_b0_nn) +#define SGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sgemm_small_kernel_b0_nt) +#define SGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sgemm_small_kernel_b0_tn) +#define SGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sgemm_small_kernel_b0_tt) + + #define SGEMM_NN sgemm_nn #define SGEMM_CN sgemm_tn #define SGEMM_TN sgemm_tn diff --git a/common_sb.h b/common_sb.h index 9976e812e..d21e7a563 100644 --- a/common_sb.h +++ b/common_sb.h @@ -24,6 +24,7 @@ #define SBGEMM_BETA sbgemm_beta #define SBGEMM_KERNEL sbgemm_kernel +#define SBGEMM_SMALL_MATRIX_PERMIT sbgemm_small_matrix_permit #else #define SBDOT_K gotoblas -> sbdot_k @@ -41,8 +42,19 @@ #define SBGEMM_BETA gotoblas -> sbgemm_beta #define SBGEMM_KERNEL gotoblas -> sbgemm_kernel +#define SBGEMM_SMALL_MATRIX_PERMIT gotoblas -> sbgemm_small_matrix_permit #endif +#define SBGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sbgemm_small_kernel_nn) +#define SBGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sbgemm_small_kernel_nt) +#define SBGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sbgemm_small_kernel_tn) +#define SBGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sbgemm_small_kernel_tt) + +#define SBGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sbgemm_small_kernel_b0_nn) +#define SBGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sbgemm_small_kernel_b0_nt) +#define SBGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sbgemm_small_kernel_b0_tn) +#define SBGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sbgemm_small_kernel_b0_tt) + #define SBGEMM_NN sbgemm_nn #define SBGEMM_CN sbgemm_tn #define SBGEMM_TN sbgemm_tn diff --git a/common_x86.h b/common_x86.h index ec928e236..bc77eca58 100644 --- a/common_x86.h +++ b/common_x86.h @@ -340,7 +340,8 @@ REALNAME: .align 16; \ 
.globl REALNAME ;\ .type REALNAME, @function; \ -REALNAME: +REALNAME: \ + _CET_ENDBR #ifdef PROFILE #define PROFCODE call mcount diff --git a/common_x86_64.h b/common_x86_64.h index b813336c6..729a055ce 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -451,7 +451,8 @@ REALNAME: .align 512; \ .globl REALNAME ;\ .type REALNAME, @function; \ -REALNAME: +REALNAME: \ + _CET_ENDBR #ifdef PROFILE #define PROFCODE call *mcount@GOTPCREL(%rip) diff --git a/common_z.h b/common_z.h index f1e78dd08..c12d71b39 100644 --- a/common_z.h +++ b/common_z.h @@ -232,6 +232,8 @@ #define ZGEADD_K zgeadd_k +#define ZGEMM_SMALL_MATRIX_PERMIT zgemm_small_matrix_permit + #else #define ZAMAX_K gotoblas -> zamax_k @@ -426,8 +428,51 @@ #define ZGEADD_K gotoblas -> zgeadd_k +#define ZGEMM_SMALL_MATRIX_PERMIT gotoblas -> zgemm_small_matrix_permit + #endif +#define ZGEMM_SMALL_KERNEL_NN FUNC_OFFSET(zgemm_small_kernel_nn) +#define ZGEMM_SMALL_KERNEL_NT FUNC_OFFSET(zgemm_small_kernel_nt) +#define ZGEMM_SMALL_KERNEL_NR FUNC_OFFSET(zgemm_small_kernel_nr) +#define ZGEMM_SMALL_KERNEL_NC FUNC_OFFSET(zgemm_small_kernel_nc) + +#define ZGEMM_SMALL_KERNEL_TN FUNC_OFFSET(zgemm_small_kernel_tn) +#define ZGEMM_SMALL_KERNEL_TT FUNC_OFFSET(zgemm_small_kernel_tt) +#define ZGEMM_SMALL_KERNEL_TR FUNC_OFFSET(zgemm_small_kernel_tr) +#define ZGEMM_SMALL_KERNEL_TC FUNC_OFFSET(zgemm_small_kernel_tc) + +#define ZGEMM_SMALL_KERNEL_RN FUNC_OFFSET(zgemm_small_kernel_rn) +#define ZGEMM_SMALL_KERNEL_RT FUNC_OFFSET(zgemm_small_kernel_rt) +#define ZGEMM_SMALL_KERNEL_RR FUNC_OFFSET(zgemm_small_kernel_rr) +#define ZGEMM_SMALL_KERNEL_RC FUNC_OFFSET(zgemm_small_kernel_rc) + +#define ZGEMM_SMALL_KERNEL_CN FUNC_OFFSET(zgemm_small_kernel_cn) +#define ZGEMM_SMALL_KERNEL_CT FUNC_OFFSET(zgemm_small_kernel_ct) +#define ZGEMM_SMALL_KERNEL_CR FUNC_OFFSET(zgemm_small_kernel_cr) +#define ZGEMM_SMALL_KERNEL_CC FUNC_OFFSET(zgemm_small_kernel_cc) + +#define ZGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(zgemm_small_kernel_b0_nn) +#define ZGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(zgemm_small_kernel_b0_nt) +#define ZGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(zgemm_small_kernel_b0_nr) +#define ZGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(zgemm_small_kernel_b0_nc) + +#define ZGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(zgemm_small_kernel_b0_tn) +#define ZGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(zgemm_small_kernel_b0_tt) +#define ZGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(zgemm_small_kernel_b0_tr) +#define ZGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(zgemm_small_kernel_b0_tc) + +#define ZGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(zgemm_small_kernel_b0_rn) +#define ZGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(zgemm_small_kernel_b0_rt) +#define ZGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(zgemm_small_kernel_b0_rr) +#define ZGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(zgemm_small_kernel_b0_rc) + +#define ZGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(zgemm_small_kernel_b0_cn) +#define ZGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(zgemm_small_kernel_b0_ct) +#define ZGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(zgemm_small_kernel_b0_cr) +#define ZGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(zgemm_small_kernel_b0_cc) + + #define ZGEMM_NN zgemm_nn #define ZGEMM_CN zgemm_cn #define ZGEMM_TN zgemm_tn diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile index 81e3470ef..be8313e65 100644 --- a/cpp_thread_test/Makefile +++ b/cpp_thread_test/Makefile @@ -1,13 +1,14 @@ -include ../Makefile.rule +TOPDIR = .. 
+include $(TOPDIR)/Makefile.system
 all :: dgemv_tester dgemm_tester
 dgemv_tester :
-	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
+	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester
 	./dgemv_tester
 dgemm_tester : dgemv_tester
-	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
+	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester
 	./dgemm_tester
 clean ::
diff --git a/cpuid.h b/cpuid.h
index 824e0bc70..55478893c 100644
--- a/cpuid.h
+++ b/cpuid.h
@@ -54,6 +54,7 @@
 #define VENDOR_TRANSMETA 9
 #define VENDOR_NSC 10
 #define VENDOR_HYGON 11
+#define VENDOR_ZHAOXIN 12
 #define VENDOR_UNKNOWN 99
 #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
@@ -119,6 +120,7 @@
 #define CORE_SKYLAKEX 28
 #define CORE_DHYANA 29
 #define CORE_COOPERLAKE 30
+#define CORE_SAPPHIRERAPIDS 31
 #define HAVE_SSE (1 << 0)
 #define HAVE_SSE2 (1 << 1)
@@ -144,6 +146,7 @@
 #define HAVE_AVX512VL (1 << 21)
 #define HAVE_AVX2 (1 << 22)
 #define HAVE_AVX512BF16 (1 << 23)
+#define HAVE_AMXBF16 (1 << 24)
 #define CACHE_INFO_L1_I 1
 #define CACHE_INFO_L1_D 2
@@ -221,6 +224,7 @@ typedef struct {
 #define CPUTYPE_SKYLAKEX 52
 #define CPUTYPE_DHYANA 53
 #define CPUTYPE_COOPERLAKE 54
+#define CPUTYPE_SAPPHIRERAPIDS 55
 #define CPUTYPE_HYGON_UNKNOWN 99
diff --git a/cpuid_arm64.c b/cpuid_arm64.c
index 5f5d7771b..cc3a82815 100644
--- a/cpuid_arm64.c
+++ b/cpuid_arm64.c
@@ -26,20 +26,25 @@
 *****************************************************************************/
 #include <string.h>
-#ifdef OS_DARWIN
+#ifdef __APPLE__
 #include <sys/sysctl.h>
 int32_t value;
 size_t length=sizeof(value);
+int64_t value64;
+size_t length64=sizeof(value64);
 #endif
 #define CPU_UNKNOWN 0
 #define CPU_ARMV8 1
 // Arm
 #define CPU_CORTEXA53 2
+#define CPU_CORTEXA55 14
 #define CPU_CORTEXA57 3
 #define CPU_CORTEXA72 4
 #define CPU_CORTEXA73 5
 #define CPU_NEOVERSEN1 11
+#define CPU_NEOVERSEV1 16
+#define CPU_NEOVERSEN2 17
 // Qualcomm
 #define CPU_FALKOR 6
 // Cavium
@@ -52,6 +57,8 @@ size_t length=sizeof(value);
 #define CPU_EMAG8180 10
 // Apple
 #define CPU_VORTEX 13
+// Fujitsu
+#define CPU_A64FX 15
 static char *cpuname[] = {
 "UNKNOWN",
@@ -66,8 +73,12 @@ static char *cpuname[] = {
 "TSV110",
 "EMAG8180",
 "NEOVERSEN1",
+  "NEOVERSEV1",
+  "NEOVERSEN2",
 "THUNDERX3T110",
-  "VORTEX"
+  "VORTEX",
+  "CORTEXA55",
+  "A64FX"
 };
 static char *cpuname_lower[] = {
@@ -83,8 +94,12 @@ static char *cpuname_lower[] = {
 "tsv110",
 "emag8180",
 "neoversen1",
+  "neoversev1",
+  "neoversen2",
 "thunderx3t110",
-  "vortex"
+  "vortex",
+  "cortexa55",
+  "a64fx"
 };
 int get_feature(char *search)
@@ -161,6 +176,12 @@ int detect(void)
 return CPU_CORTEXA73;
 else if (strstr(cpu_part, "0xd0c"))
 return CPU_NEOVERSEN1;
+ else if (strstr(cpu_part, "0xd40"))
+ return CPU_NEOVERSEV1;
+ else if (strstr(cpu_part, "0xd49"))
+ return CPU_NEOVERSEN2;
+ else if (strstr(cpu_part, "0xd05"))
+ return CPU_CORTEXA55;
 }
 // Qualcomm
 else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
@@ -178,6 +199,9 @@ int detect(void)
 // Ampere
 else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000"))
 return CPU_EMAG8180;
+ // Fujitsu
+ else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001"))
+ return CPU_A64FX;
 }
 p = (char *) NULL ;
@@ -207,9 +231,9 @@ int detect(void)
 }
 #else
-#ifdef DARWIN
+#ifdef __APPLE__ sysctlbyname("hw.cpufamily",&value,&length,NULL,0); - if (value ==131287967) return CPU_VORTEX; + if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; #endif return CPU_ARMV8; #endif @@ -260,7 +284,7 @@ int n=0; printf("#define NUM_CORES %d\n",n); #endif -#ifdef DARWIN +#ifdef __APPLE__ sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); printf("#define NUM_CORES %d\n",value); #endif @@ -280,153 +304,196 @@ void get_cpuconfig(void) switch (d) { - case CPU_CORTEXA53: - printf("#define %s\n", cpuname[d]); - // Fall-through - case CPU_ARMV8: - // Minimum parameters for ARMv8 (based on A53) - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L2_SIZE 262144\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 4\n"); + case CPU_CORTEXA53: + case CPU_CORTEXA55: + printf("#define %s\n", cpuname[d]); + // Fall-through + case CPU_ARMV8: + // Minimum parameters for ARMv8 (based on A53) + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 262144\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); break; - case CPU_CORTEXA57: - case CPU_CORTEXA72: - case CPU_CORTEXA73: + case CPU_CORTEXA57: + case CPU_CORTEXA72: + case CPU_CORTEXA73: // Common minimum settings for these Arm cores // Can change a lot, but we need to be conservative // TODO: detect info from /sys if possible - printf("#define %s\n", cpuname[d]); - printf("#define L1_CODE_SIZE 49152\n"); - printf("#define L1_CODE_LINESIZE 64\n"); - printf("#define L1_CODE_ASSOCIATIVE 3\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L1_DATA_ASSOCIATIVE 2\n"); - printf("#define L2_SIZE 524288\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - break; - case CPU_NEOVERSEN1: - printf("#define %s\n", cpuname[d]); - printf("#define L1_CODE_SIZE 65536\n"); - printf("#define L1_CODE_LINESIZE 64\n"); - printf("#define L1_CODE_ASSOCIATIVE 4\n"); - printf("#define L1_DATA_SIZE 65536\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L1_DATA_ASSOCIATIVE 4\n"); - printf("#define L2_SIZE 1048576\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - break; - - case CPU_FALKOR: - printf("#define FALKOR\n"); - printf("#define L1_CODE_SIZE 65536\n"); - printf("#define L1_CODE_LINESIZE 64\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 128\n"); - printf("#define L2_SIZE 524288\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - break; - - case CPU_THUNDERX: - printf("#define THUNDERX\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 128\n"); - printf("#define L2_SIZE 16777216\n"); - printf("#define L2_LINESIZE 128\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - break; - - case CPU_THUNDERX2T99: - printf("#define THUNDERX2T99 \n"); - printf("#define 
L1_CODE_SIZE 32768 \n"); - printf("#define L1_CODE_LINESIZE 64 \n"); - printf("#define L1_CODE_ASSOCIATIVE 8 \n"); - printf("#define L1_DATA_SIZE 32768 \n"); - printf("#define L1_DATA_LINESIZE 64 \n"); - printf("#define L1_DATA_ASSOCIATIVE 8 \n"); - printf("#define L2_SIZE 262144 \n"); - printf("#define L2_LINESIZE 64 \n"); - printf("#define L2_ASSOCIATIVE 8 \n"); - printf("#define L3_SIZE 33554432 \n"); - printf("#define L3_LINESIZE 64 \n"); - printf("#define L3_ASSOCIATIVE 32 \n"); - printf("#define DTB_DEFAULT_ENTRIES 64 \n"); - printf("#define DTB_SIZE 4096 \n"); - break; + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 49152\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 3\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 2\n"); + printf("#define L2_SIZE 524288\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; + case CPU_NEOVERSEN1: + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 48\n"); + printf("#define DTB_SIZE 4096\n"); + break; + + case CPU_NEOVERSEV1: + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 48\n"); + printf("#define DTB_SIZE 4096\n"); + break; + + case CPU_NEOVERSEN2: + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 48\n"); + printf("#define DTB_SIZE 4096\n"); + break; + + case CPU_FALKOR: + printf("#define FALKOR\n"); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 128\n"); + printf("#define L2_SIZE 524288\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + break; + + case CPU_THUNDERX: + printf("#define THUNDERX\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 128\n"); + printf("#define L2_SIZE 16777216\n"); + printf("#define L2_LINESIZE 128\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + break; + + case CPU_THUNDERX2T99: + printf("#define THUNDERX2T99 \n"); + printf("#define 
L1_CODE_SIZE 32768 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 8 \n"); + printf("#define L1_DATA_SIZE 32768 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 8 \n"); + printf("#define L2_SIZE 262144 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define L3_SIZE 33554432 \n"); + printf("#define L3_LINESIZE 64 \n"); + printf("#define L3_ASSOCIATIVE 32 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; - case CPU_TSV110: - printf("#define TSV110 \n"); - printf("#define L1_CODE_SIZE 65536 \n"); - printf("#define L1_CODE_LINESIZE 64 \n"); - printf("#define L1_CODE_ASSOCIATIVE 4 \n"); - printf("#define L1_DATA_SIZE 65536 \n"); - printf("#define L1_DATA_LINESIZE 64 \n"); - printf("#define L1_DATA_ASSOCIATIVE 4 \n"); - printf("#define L2_SIZE 524228 \n"); - printf("#define L2_LINESIZE 64 \n"); - printf("#define L2_ASSOCIATIVE 8 \n"); - printf("#define DTB_DEFAULT_ENTRIES 64 \n"); - printf("#define DTB_SIZE 4096 \n"); - break; - - case CPU_EMAG8180: - // Minimum parameters for ARMv8 (based on A53) - printf("#define EMAG8180\n"); - printf("#define L1_CODE_SIZE 32768\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L2_SIZE 262144\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - break; - - case CPU_THUNDERX3T110: - printf("#define THUNDERX3T110 \n"); - printf("#define L1_CODE_SIZE 65536 \n"); - printf("#define L1_CODE_LINESIZE 64 \n"); - printf("#define L1_CODE_ASSOCIATIVE 8 \n"); - printf("#define L1_DATA_SIZE 32768 \n"); - printf("#define L1_DATA_LINESIZE 64 \n"); - printf("#define L1_DATA_ASSOCIATIVE 8 \n"); - printf("#define L2_SIZE 524288 \n"); - printf("#define L2_LINESIZE 64 \n"); - printf("#define L2_ASSOCIATIVE 8 \n"); - printf("#define L3_SIZE 94371840 \n"); - printf("#define L3_LINESIZE 64 \n"); - printf("#define L3_ASSOCIATIVE 32 \n"); - printf("#define DTB_DEFAULT_ENTRIES 64 \n"); - printf("#define DTB_SIZE 4096 \n"); - break; -#ifdef DARWIN - case CPU_VORTEX: - printf("#define VORTEX \n"); - sysctlbyname("hw.l1icachesize",&value,&length,NULL,0); - printf("#define L1_CODE_SIZE %d \n",value); - sysctlbyname("hw.cachelinesize",&value,&length,NULL,0); - printf("#define L1_CODE_LINESIZE %d \n",value); - sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0); - printf("#define L1_DATA_SIZE %d \n",value); - sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0); - printf("#define L2_SIZE %d \n",value); - break; + case CPU_TSV110: + printf("#define TSV110 \n"); + printf("#define L1_CODE_SIZE 65536 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 4 \n"); + printf("#define L1_DATA_SIZE 65536 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 4 \n"); + printf("#define L2_SIZE 524228 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; + + case CPU_EMAG8180: + // Minimum parameters for ARMv8 (based on A53) + printf("#define EMAG8180\n"); + printf("#define L1_CODE_SIZE 32768\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 262144\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + 
printf("#define DTB_SIZE 4096\n"); + break; + + case CPU_THUNDERX3T110: + printf("#define THUNDERX3T110 \n"); + printf("#define L1_CODE_SIZE 65536 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 8 \n"); + printf("#define L1_DATA_SIZE 32768 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 8 \n"); + printf("#define L2_SIZE 524288 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define L3_SIZE 94371840 \n"); + printf("#define L3_LINESIZE 64 \n"); + printf("#define L3_ASSOCIATIVE 32 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; +#ifdef __APPLE__ + case CPU_VORTEX: + printf("#define VORTEX \n"); + sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); + printf("#define L1_CODE_SIZE %lld \n",value64); + sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); + printf("#define L1_CODE_LINESIZE %lld \n",value64); + sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); + printf("#define L1_DATA_SIZE %lld \n",value64); + sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); + printf("#define L2_SIZE %lld \n",value64); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; #endif + case CPU_A64FX: + printf("#define A64FX\n"); + printf("#define L1_CODE_SIZE 65535\n"); + printf("#define L1_DATA_SIZE 65535\n"); + printf("#define L1_DATA_LINESIZE 256\n"); + printf("#define L2_SIZE 8388608\n"); + printf("#define L2_LINESIZE 256\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; } get_cpucount(); } diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c new file mode 100644 index 000000000..79b186bf1 --- /dev/null +++ b/cpuid_loongarch64.c @@ -0,0 +1,110 @@ +/***************************************************************************** +Copyright (c) 2011-2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +**********************************************************************************/ + +#include + +#define CPU_UNKNOWN 0 +#define CPU_LOONGSON3R5 1 + +#define LOONGARCH_CFG2 0x02 +#define LOONGARCH_LASX 1<<7 + +static char *cpuname[] = { + "UNKNOWN", + "LOONGSON3R5" +}; + +int detect(void) { + uint32_t reg = 0; + + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg) + : "r"(LOONGARCH_CFG2) + ); + + if (reg & LOONGARCH_LASX) + return CPU_LOONGSON3R5; + else + return CPU_UNKNOWN; +} + +char *get_corename(void) { + return cpuname[detect()]; +} + +void get_architecture(void) { + printf("LOONGARCH64"); +} + +void get_subarchitecture(void) { + if (detect() == CPU_LOONGSON3R5) { + printf("LOONGSON3R5"); + } else { + printf("UNKNOWN"); + } +} + +void get_subdirname(void) { + printf("loongarch64"); +} + +void get_cpuconfig(void) { + if (detect() == CPU_LOONGSON3R5) { + printf("#define LOONGSON3R5\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + } else { + printf("#define LOONGSON3R5\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + } +} + +void get_libname(void){ + if (detect() == CPU_LOONGSON3R5) { + printf("loongson3r5\n"); + } else { + printf("loongarch64\n"); + } +} diff --git a/cpuid_mips.c b/cpuid_mips.c index e6e837f73..d787e7120 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -165,6 +165,7 @@ void get_cpuconfig(void){ }else{ printf("#define UNKNOWN\n"); } + if (!get_feature("msa")) printf("#define NO_MSA\n"); } void get_libname(void){ @@ -178,3 +179,38 @@ void get_libname(void){ printf("mips\n"); } } + +int get_feature(char *search) +{ + +#ifdef __linux + FILE *infile; + char buffer[2048], *p,*t; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + + if( p == NULL ) return 0; + + t = strtok(p," "); + while( t = strtok(NULL," ")) + { + if (strstr(t, search)) { return(1); } + } + +#endif + return(0); +} + diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 0c19ac1e7..8753ee3f0 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -70,19 +70,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -#define CPU_UNKNOWN 0 -#define CPU_SICORTEX 1 -#define CPU_LOONGSON3A 2 -#define CPU_LOONGSON3B 3 -#define CPU_I6400 4 -#define CPU_P6600 5 -#define CPU_I6500 6 +#define CPU_UNKNOWN 0 +#define CPU_SICORTEX 1 +#define CPU_LOONGSON3R3 2 +#define CPU_LOONGSON3R4 3 +#define CPU_I6400 4 +#define CPU_P6600 5 +#define CPU_I6500 6 static char *cpuname[] = { "UNKNOWN", "SICORTEX", - "LOONGSON3A", - "LOONGSON3B", + "LOONGSON3R3", + "LOONGSON3R4", "I6400", "P6600", "I6500" @@ -90,48 +90,13 @@ static char *cpuname[] = { int detect(void){ -#ifdef __linux +#ifdef linux FILE *infile; char buffer[512], *p; p = (char *)NULL; - infile = fopen("/proc/cpuinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("cpu", buffer, 3)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - }else if (strstr(p, "Loongson-3")){ - infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("system type", buffer, 11)){ - p = strchr(buffer, ':') + 2; - break; - } - } - fclose(infile); - if (strstr(p, "loongson3a")) - return CPU_LOONGSON3A; - }else{ - return CPU_SICORTEX; - } - } //Check model name for Loongson3 infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("model name", buffer, 10)){ p = strchr(buffer, ':') + 2; @@ -139,12 +104,14 @@ int detect(void){ } } fclose(infile); - if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - } + if (p != NULL){ + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ + return CPU_LOONGSON3R3; + } else if (strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ + return CPU_LOONGSON3R4; + } else{ + return CPU_SICORTEX; + } } #endif return CPU_UNKNOWN; @@ -159,10 +126,10 @@ void get_architecture(void){ } void get_subarchitecture(void){ - if(detect()==CPU_LOONGSON3A) { - printf("LOONGSON3A"); - }else if(detect()==CPU_LOONGSON3B){ - printf("LOONGSON3B"); + if(detect()==CPU_LOONGSON3R3) { + printf("LOONGSON3R3"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("LOONGSON3R4"); }else if(detect()==CPU_I6400){ printf("I6400"); }else if(detect()==CPU_P6600){ @@ -179,8 +146,8 @@ void get_subdirname(void){ } void get_cpuconfig(void){ - if(detect()==CPU_LOONGSON3A) { - printf("#define LOONGSON3A\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("#define LOONGSON3R3\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -188,8 +155,8 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); - }else if(detect()==CPU_LOONGSON3B){ - printf("#define LOONGSON3B\n"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("#define LOONGSON3R4\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -234,13 +201,14 @@ void get_cpuconfig(void){ printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); } + if (!get_feature("msa")) printf("#define NO_MSA\n"); } void get_libname(void){ - if(detect()==CPU_LOONGSON3A) { - printf("loongson3a\n"); - }else 
if(detect()==CPU_LOONGSON3B) { - printf("loongson3b\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("loongson3r3\n"); + }else if(detect()==CPU_LOONGSON3R4) { + printf("loongson3r4\n"); }else if(detect()==CPU_I6400) { printf("i6400\n"); }else if(detect()==CPU_P6600) { @@ -251,3 +219,38 @@ void get_libname(void){ printf("mips64\n"); } } + +int get_feature(char *search) +{ + +#ifdef __linux + FILE *infile; + char buffer[2048], *p,*t; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + + if( p == NULL ) return 0; + + t = strtok(p," "); + while( t = strtok(NULL," ")) + { + if (strstr(t, search)) { return(1); } + } + +#endif + return(0); +} + diff --git a/cpuid_x86.c b/cpuid_x86.c index 84c12ff43..d7d85eb20 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1,3 +1,4 @@ +//{ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ @@ -266,6 +267,31 @@ int support_avx512_bf16(){ #endif } +#define BIT_AMX_TILE 0x01000000 +#define BIT_AMX_BF16 0x00400000 +#define BIT_AMX_ENBD 0x00060000 + +int support_amx_bf16() { +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx512()) + return 0; + // CPUID.7.0:EDX indicates AMX support + cpuid_count(7, 0, &eax, &ebx, &ecx, &edx); + if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) { + // CPUID.D.0:EAX[17:18] indicates AMX enabled + cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD) + ret = 1; + } + return ret; +#else + return 0; +#endif +} + int get_vendor(void){ int eax, ebx, ecx, edx; char vendor[13]; @@ -283,6 +309,7 @@ int get_vendor(void){ if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX; if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN; if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR; + if (!strcmp(vendor, " Shanghai ")) return VENDOR_ZHAOXIN; if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE; if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; @@ -296,9 +323,11 @@ int get_vendor(void){ int get_cputype(int gettype){ int eax, ebx, ecx, edx; +/* int extend_family, family; int extend_model, model; int type, stepping; +*/ int feature = 0; cpuid(1, &eax, &ebx, &ecx, &edx); @@ -352,6 +381,7 @@ int get_cputype(int gettype){ if (support_avx2()) feature |= HAVE_AVX2; if (support_avx512()) feature |= HAVE_AVX512VL; if (support_avx512_bf16()) feature |= HAVE_AVX512BF16; + if (support_amx_bf16()) feature |= HAVE_AMXBF16; if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; #endif @@ -400,7 +430,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ cpuid(0, &cpuid_level, &ebx, &ecx, &edx); if (cpuid_level > 1) { - int numcalls =0 ; + int numcalls; + cpuid(2, &eax, &ebx, &ecx, &edx); numcalls = BITMASK(eax, 0, 0xff); //FIXME some systems may require repeated calls to read all entries info[ 0] = BITMASK(eax, 8, 0xff); @@ -1066,7 +1097,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_HYGON) || - (get_vendor() == VENDOR_CENTAUR)) { + (get_vendor() == VENDOR_CENTAUR) || + (get_vendor() == VENDOR_ZHAOXIN)) { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); LDTB.size = 4096; @@ -1189,7 +1221,7 @@ int 
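support_amx_bf16() above gates HAVE_AMXBF16 on CPUID leaf 7 subleaf 0 (EDX bits 22 and 24, the BIT_AMX_BF16 / BIT_AMX_TILE masks) plus the tile-state bits of leaf 0xD. Outside this file the same probe can be written with the compiler's <cpuid.h> helpers; a sketch reusing the patch's bit masks (the has_amx_bf16 name is illustrative, and the patch additionally requires support_avx512(), omitted here):

#include <cpuid.h>

static int has_amx_bf16(void)
{
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        return 0;                                /* leaf 7 unsupported          */
    if (!(edx & 0x01000000) || !(edx & 0x00400000))
        return 0;                                /* missing AMX-TILE or AMX-BF16 */

    if (!__get_cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx))
        return 0;
    return (eax & 0x00060000) == 0x00060000;     /* XTILECFG + XTILEDATA state   */
}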
get_cacheinfo(int type, cache_info_t *cacheinfo){ int get_cpuname(void){ - int family, exfamily, model, vendor, exmodel; + int family, exfamily, model, vendor, exmodel, stepping; if (!have_cpuid()) return CPUTYPE_80386; @@ -1197,6 +1229,7 @@ int get_cpuname(void){ exfamily = get_cputype(GET_EXFAMILY); model = get_cputype(GET_MODEL); exmodel = get_cputype(GET_EXMODEL); + stepping = get_cputype(GET_STEPPING); vendor = get_vendor(); @@ -1398,6 +1431,17 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; + case 10: // Ice Lake SP + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; } break; case 7: // family 6 exmodel 7 @@ -1415,9 +1459,18 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 9: case 8: switch (model) { + case 12: // Tiger Lake + case 13: // Tiger Lake (11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz) + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; case 14: // Kaby Lake and refreshes if(support_avx2()) return CPUTYPE_HASWELL; @@ -1425,21 +1478,74 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; - } - case 10: //family 6 exmodel 10 + case 15: // Sapphire Rapids + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; + case 9: switch (model) { - case 5: // Comet Lake H and S - case 6: // Comet Lake U + case 7: // Alder Lake desktop + case 10: // Alder Lake mobile + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; if(support_avx2()) return CPUTYPE_HASWELL; if(support_avx()) - return CPUTYPE_SANDYBRIDGE; + return CPUTYPE_SANDYBRIDGE; else - return CPUTYPE_NEHALEM; - } - break; - } + return CPUTYPE_NEHALEM; + case 13: // Ice Lake NNPI + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + case 14: // Kaby Lake and refreshes + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } break; + case 10: //family 6 exmodel 10 + switch (model) { + case 5: // Comet Lake H and S + case 6: // Comet Lake U + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + case 7: // Rocket Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; + } + break; case 0x7: return CPUTYPE_ITANIUM; case 0xf: @@ -1538,7 +1644,6 @@ int get_cpuname(void){ else return CPUTYPE_BARCELONA; } - break; case 10: // Zen3 if(support_avx()) #ifndef NO_AVX2 @@ -1598,13 +1703,20 @@ int get_cpuname(void){ switch (family) { case 0x5: return CPUTYPE_CENTAURC6; - break; case 0x6: - return CPUTYPE_NANO; - break; - + if (model == 0xf && stepping < 0xe) + return CPUTYPE_NANO; + return CPUTYPE_NEHALEM; + default: + if (family >= 0x7) + return CPUTYPE_NEHALEM; + else + return CPUTYPE_VIAC3; } 
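For readers decoding the family-6 switches in get_cpuname() above: the (exmodel, model) pair combines into the familiar display model as (exmodel << 4) | model, so exmodel 10 / model 7 is 0xA7 (Rocket Lake) and exmodel 9 / model 7 is 0x97 (Alder Lake desktop). A standalone cross-check using the compiler's <cpuid.h>, not this file's cpuid() helpers:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 1;
    unsigned int family  = (eax >> 8)  & 0xf;
    unsigned int model   = (eax >> 4)  & 0xf;
    unsigned int exmodel = (eax >> 16) & 0xf;      /* "extended model" field */
    unsigned int display = (family == 6 || family == 15)
                         ? (exmodel << 4) | model  /* same combination the switches imply */
                         : model;
    printf("family %u, display model 0x%x\n", family, display);
    return 0;
}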
- return CPUTYPE_VIAC3; + } + + if (vendor == VENDOR_ZHAOXIN){ + return CPUTYPE_NEHALEM; } if (vendor == VENDOR_RISE){ @@ -1837,7 +1949,7 @@ char *get_lower_cpunamechar(void){ int get_coretype(void){ - int family, exfamily, model, exmodel, vendor; + int family, exfamily, model, exmodel, vendor, stepping; if (!have_cpuid()) return CORE_80486; @@ -1845,6 +1957,7 @@ int get_coretype(void){ exfamily = get_cputype(GET_EXFAMILY); model = get_cputype(GET_MODEL); exmodel = get_cputype(GET_EXMODEL); + stepping = get_cputype(GET_STEPPING); vendor = get_vendor(); @@ -2002,19 +2115,7 @@ int get_coretype(void){ return CORE_NEHALEM; } break; - case 10: - switch (model) { - case 5: // Comet Lake H and S - case 6: // Comet Lake U - if(support_avx()) - #ifndef NO_AVX2 - return CORE_HASWELL; - #else - return CORE_SANDYBRIDGE; - #endif - else - return CORE_NEHALEM; - } + case 5: switch (model) { case 6: @@ -2068,6 +2169,7 @@ int get_coretype(void){ return CORE_NEHALEM; } break; + case 6: if (model == 6) #ifndef NO_AVX512 @@ -2081,12 +2183,27 @@ int get_coretype(void){ #endif else return CORE_NEHALEM; -#endif - break; +#endif + if (model == 10 || model == 12) +#ifndef NO_AVX512 + if(support_avx512_bf16()) + return CORE_COOPERLAKE; + return CORE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; +#endif + case 7: if (model == 10) return CORE_NEHALEM; - if (model == 14) + if (model == 13 || model == 14) // Ice Lake #ifndef NO_AVX512 return CORE_SKYLAKEX; #else @@ -2100,9 +2217,19 @@ int get_coretype(void){ return CORE_NEHALEM; #endif break; - case 9: + case 8: - if (model == 14) { // Kaby Lake + if (model == 12 || model == 13) { // Tiger Lake + if(support_avx512()) + return CORE_SKYLAKEX; + if(support_avx2()) + return CORE_HASWELL; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } + if (model == 14) { // Kaby Lake mobile if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; @@ -2112,12 +2239,82 @@ int get_coretype(void){ else return CORE_NEHALEM; } - } + if (model == 15) { // Sapphire Rapids + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } break; + case 9: + if (model == 7 || model == 10) { // Alder Lake + if(support_avx2()) + return CORE_HASWELL; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } + if (model == 13) { // Ice Lake NNPI + if(support_avx512()) + return CORE_SKYLAKEX; + if(support_avx2()) + return CORE_HASWELL; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } + if (model == 14) { // Kaby Lake desktop + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; + } + break; + + case 10: + switch (model) { + case 5: // Comet Lake H and S + case 6: // Comet Lake U + if(support_avx()) + #ifndef NO_AVX2 + return CORE_HASWELL; + #else + return CORE_SANDYBRIDGE; + #endif + else + return CORE_NEHALEM; + case 7:// Rocket Lake +#ifndef NO_AVX512 + if(support_avx512()) + return CORE_SKYLAKEX; +#endif +#ifndef NO_AVX2 + if(support_avx2()) + return CORE_HASWELL; +#endif + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } + case 15: if (model <= 0x2) return CORE_NORTHWOOD; else return CORE_PRESCOTT; + } } } @@ -2216,10 +2413,19 @@ int 
get_coretype(void){ if (vendor == VENDOR_CENTAUR) { switch (family) { case 0x6: - return CORE_NANO; - break; + if (model == 0xf && stepping < 0xe) + return CORE_NANO; + return CORE_NEHALEM; + default: + if (family >= 0x7) + return CORE_NEHALEM; + else + return CORE_VIAC3; } - return CORE_VIAC3; + } + + if (vendor == VENDOR_ZHAOXIN) { + return CORE_NEHALEM; } return CORE_UNKNOWN; @@ -2302,6 +2508,7 @@ void get_cpuconfig(void){ if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n"); + if (features & HAVE_AMXBF16 ) printf("#define HAVE_AMXBF16\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); @@ -2373,9 +2580,11 @@ void get_sse(void){ if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n"); + if (features & HAVE_AMXBF16 ) printf("HAVE_AMXBF16=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n"); } +//} diff --git a/cpuid_zarch.c b/cpuid_zarch.c index df3b7898f..a6b953dd9 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -27,57 +27,11 @@ #include -#define CPU_GENERIC 0 -#define CPU_Z13 1 -#define CPU_Z14 2 -#define CPU_Z15 3 +#include "cpuid_zarch.h" -static char *cpuname[] = { - "ZARCH_GENERIC", - "Z13", - "Z14", - "Z15" -}; - -static char *cpuname_lower[] = { - "zarch_generic", - "z13", - "z14", - "z15" -}; - -int detect(void) -{ - FILE *infile; - char buffer[512], *p; - - p = (char *)NULL; - infile = fopen("/proc/sysinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("Type", buffer, 4)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if (strstr(p, "2964")) return CPU_Z13; - if (strstr(p, "2965")) return CPU_Z13; - if (strstr(p, "3906")) return CPU_Z14; - if (strstr(p, "3907")) return CPU_Z14; - if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14 - if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14 - - return CPU_GENERIC; -} void get_libname(void) { - int d = detect(); printf("%s", cpuname_lower[d]); } diff --git a/cpuid_zarch.h b/cpuid_zarch.h new file mode 100644 index 000000000..686f2eb17 --- /dev/null +++ b/cpuid_zarch.h @@ -0,0 +1,101 @@ +#include + +#define CPU_GENERIC 0 +#define CPU_Z13 1 +#define CPU_Z14 2 +#define CPU_Z15 3 + +static char *cpuname[] = { + "ZARCH_GENERIC", + "Z13", + "Z14", + "Z15" +}; + +static char *cpuname_lower[] = { + "zarch_generic", + "z13", + "z14", + "z15" +}; + +// Guard the use of getauxval() on glibc version >= 2.16 +#ifdef __GLIBC__ +#include +#if __GLIBC_PREREQ(2, 16) +#include +#define HAVE_GETAUXVAL 1 + +static unsigned long get_hwcap(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + char *maskenv; + + // honor requests for not using specific CPU features in LD_HWCAP_MASK + maskenv = getenv("LD_HWCAP_MASK"); + if (maskenv) + hwcap &= strtoul(maskenv, NULL, 0); + + return hwcap; + // note that a missing auxval is interpreted as no capabilities + // available, which is safe. 
+} + +#else // __GLIBC_PREREQ(2, 16) +#warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" + +static unsigned long get_hwcap(void) { + // treat missing support for getauxval() as no capabilities available, + // which is safe. + return 0; +} +#endif // __GLIBC_PREREQ(2, 16) +#endif // __GLIBC + +static int detect(void) +{ + unsigned long hwcap = get_hwcap(); + + // Choose the architecture level for optimized kernels based on hardware + // capability bits (just like glibc chooses optimized implementations). + // + // The hardware capability bits that are used here indicate both + // hardware support for a particular ISA extension and the presence of + // software support to enable its use. For example, when HWCAP_S390_VX + // is set then both the CPU can execute SIMD instructions and the Linux + // kernel can manage applications using the vector registers and SIMD + // instructions. + // + // See glibc's sysdeps/s390/dl-procinfo.h for an overview (also in + // sysdeps/unix/sysv/linux/s390/bits/hwcap.h) of the defined hardware + // capability bits. They are derived from the information that the + // "store facility list (extended)" instructions provide. + // (https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/s390/dl-procinfo.h;hb=HEAD) + // + // currently used: + // HWCAP_S390_VX - vector facility for z/Architecture (introduced with + // IBM z13), enables level CPU_Z13 (SIMD) + // HWCAP_S390_VXE - vector enhancements facility 1 (introduced with IBM + // z14), together with VX enables level CPU_Z14 + // (single-precision SIMD instructions) + // + // When you add optimized kernels that make use of other ISA extensions + // (e.g., for exploiting the vector-enhancements facility 2 that was introduced + // with IBM z15), then add a new architecture level (e.g., CPU_Z15) and gate + // it on the hwcap that represents it here (e.g., HWCAP_S390_VXRS_EXT2 + // for the z15 vector enhancements). + // + // To learn the value of hwcaps on a given system, set the environment + // variable LD_SHOW_AUXV and let ld.so dump it (e.g., by running + // LD_SHOW_AUXV=1 /bin/true). + // Also, the init function for dynamic arch support will print hwcaps + // when OPENBLAS_VERBOSE is set to 2 or higher. 
+ if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) + return CPU_Z14; + + if (hwcap & HWCAP_S390_VX) + return CPU_Z13; + + return CPU_GENERIC; +} + diff --git a/ctest.c b/ctest.c index d674a8cbd..fc52b43a6 100644 --- a/ctest.c +++ b/ctest.c @@ -84,7 +84,7 @@ OS_AIX OS_OSF #endif -#if defined(__WIN32) || defined(__WIN64) || defined(__WINNT) +#if defined(__WIN32) || defined(__WIN64) || defined(_WIN32) || defined(_WIN64) || defined(__WINNT) OS_WINNT #endif @@ -141,7 +141,7 @@ ARCH_SPARC ARCH_IA64 #endif -#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) +#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) || defined(__aarch64__) BINARY_64 #endif @@ -157,7 +157,15 @@ ARCH_ARM64 ARCH_RISCV64 #endif +#ifdef __loongarch64 +ARCH_LOONGARCH64 +#endif + #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) HAVE_C11 #endif +#if defined(__e2k__) +ARCH_E2K +#endif + diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 8aed9eb85..f785d3f90 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -4,10 +4,22 @@ include_directories(${PROJECT_BINARY_DIR}) enable_language(Fortran) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") +if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize") +endif() +if(WIN32) +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1 +"$ErrorActionPreference = \"Stop\"\n" +"Get-Content $args[1] | & $args[0]\n" +) +set(test_helper powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1") +else() FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh "$1 < $2\n" ) +set(test_helper sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh") +endif() foreach(float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char_upper) @@ -21,7 +33,7 @@ foreach(float_type ${FLOAT_TYPES}) c_${float_char}blas1.c) target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) add_test(NAME "x${float_char}cblat1" - COMMAND "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat1") + COMMAND $) #level2 add_executable(x${float_char}cblat2 @@ -33,7 +45,7 @@ foreach(float_type ${FLOAT_TYPES}) constant.c) target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) add_test(NAME "x${float_char}cblat2" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat2" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") + COMMAND ${test_helper} $ "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") #level3 add_executable(x${float_char}cblat3 @@ -45,6 +57,6 @@ foreach(float_type ${FLOAT_TYPES}) constant.c) target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) add_test(NAME "x${float_char}cblat3" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat3" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") + COMMAND ${test_helper} $ "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") endforeach() diff --git a/ctest/Makefile b/ctest/Makefile index 2a893cae8..c5e1094da 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -6,6 +6,9 @@ TOPDIR = .. 
include $(TOPDIR)/Makefile.system override CFLAGS += -DADD$(BU) -DCBLAS +ifeq ($(F_COMPILER),GFORTRAN) + override FFLAGS += -fno-tree-vectorize +endif override TARGET_ARCH= override TARGET_MACH= @@ -212,6 +215,9 @@ ifeq ($(C_COMPILER), CLANG) CEXTRALIB = -lomp endif endif +ifeq ($(F_COMPILER), NAG) +CEXTRALIB = -lgomp +endif endif ifeq ($(BUILD_SINGLE),1) diff --git a/ctest/c_cblas2.c b/ctest/c_cblas2.c index 057096f32..6511e5271 100644 --- a/ctest/c_cblas2.c +++ b/ctest/c_cblas2.c @@ -20,7 +20,7 @@ void F77_cgemv(int *order, char *transp, int *m, int *n, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_COMPLEX) ); + A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -50,7 +50,7 @@ void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *ku+*kl+2; - A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*ku; i++ ){ irow=*ku+*kl-i; jcol=(*ku)-i; @@ -94,7 +94,7 @@ void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -122,7 +122,7 @@ void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A=(CBLAS_TEST_COMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -154,7 +154,7 @@ void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = (CBLAS_TEST_COMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A = (CBLAS_TEST_COMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA; *incx, beta, y, *incy ); else { LDA = *k+2; - A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -251,8 +251,8 @@ void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, beta, y, *incy); else { LDA = *n; - A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); - AP = (CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* + A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); + AP = (CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -311,7 +311,7 @@ void F77_ctbmv(int *order, char *uplow, char *transp, char *diagn, x, *incx); else { LDA = *k+2; - A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -375,7 +375,7 @@ void F77_ctbsv(int *order, 
char *uplow, char *transp, char *diagn, *incx); else { LDA = *k+2; - A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX )); + A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -436,8 +436,8 @@ void F77_ctpmv(int *order, char *uplow, char *transp, char *diagn, cblas_ctpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); - AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)* + A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); + AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)* sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -491,8 +491,8 @@ void F77_ctpsv(int *order, char *uplow, char *transp, char *diagn, cblas_ctpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); - AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)* + A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); + AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)* sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -544,7 +544,7 @@ void F77_ctrmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA=*n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -573,7 +573,7 @@ void F77_ctrsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A =(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A =(CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -601,8 +601,8 @@ void F77_chpr(int *order, char *uplow, int *n, float *alpha, cblas_chpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap ); else { LDA = *n; - A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); - AP = ( CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* + A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + AP = ( CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -678,8 +678,8 @@ void F77_chpr2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, *incy, ap ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); - AP=(CBLAS_TEST_COMPLEX*)malloc( (((LDA+1)*LDA)/2)* + A=(CBLAS_TEST_COMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + AP=(CBLAS_TEST_COMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -750,7 +750,7 @@ void F77_cher(int *order, char *uplow, int *n, float *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_COMPLEX )); + A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX )); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { @@ -784,7 +784,7 @@ void F77_cher2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A= ( CBLAS_TEST_COMPLEX* 
)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { diff --git a/ctest/c_dblas2.c b/ctest/c_dblas2.c index 423a58748..ae3854c0e 100644 --- a/ctest/c_dblas2.c +++ b/ctest/c_dblas2.c @@ -19,7 +19,7 @@ void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -43,7 +43,7 @@ void F77_dger(int *order, int *m, int *n, double *alpha, double *x, int *incx, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) { for( j=0; j<*n; j++ ) @@ -74,7 +74,7 @@ void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -102,7 +102,7 @@ void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -123,7 +123,7 @@ void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -146,7 +146,7 @@ void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -170,7 +170,7 @@ void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -196,7 +196,7 @@ void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, if (*order == TEST_ROW_MJR) { LDA = *ku+*kl+2; - A = ( double* )malloc( (*n+*kl)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*kl)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*ku; i++ ){ irow=*ku+*kl-i; jcol=(*ku)-i; @@ -236,7 +236,7 @@ void F77_dtbmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -282,7 +282,7 @@ void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -325,7 +325,7 @@ void 
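A word on the (size_t) casts threaded through the c_cblas2.c / c_dblas2.c allocations above: the row-major wrappers build an (*m) x LDA copy of the input, and with plain int operands the product m*LDA can wrap before it is widened for malloc. Promoting one factor to size_t keeps the whole multiplication in size_t. The pattern in isolation (an illustrative helper, not taken from the tests):

#include <stdlib.h>

double *alloc_row_major_copy(int m, int LDA)
{
    /* bad:  malloc(m * LDA * sizeof(double));  m*LDA is evaluated in int and can overflow */
    /* good: cast one factor first, as the patch does with (*m)*(size_t)LDA*sizeof(...)    */
    return (double *)malloc((size_t)m * (size_t)LDA * sizeof(double));
}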
F77_dsbmv(int *order, char *uplow, int *n, int *k, double *alpha, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -369,8 +369,8 @@ void F77_dspmv(int *order, char *uplow, int *n, double *alpha, double *ap, if (*order == TEST_ROW_MJR) { LDA = *n; - A = ( double* )malloc( LDA*LDA*sizeof( double ) ); - AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) ); + A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) ); + AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) ); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) for( i=0; i=6*GEMM_UNROLL_N to achieve best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else - if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; +/* + if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else +*/ if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 6e1fd9e99..dfc7107b8 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,14 +367,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Split local region of B into parts */ for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ min_jj = MIN(n_to, js + div_n) - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else +/* if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else +*/ if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif /* Copy part of local region of B into workspace */ diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c index 880de4df4..e25ea7afe 100644 --- a/driver/level3/trmm_L.c +++ b/driver/level3/trmm_L.c @@ -138,7 +138,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -215,7 +215,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -320,7 +320,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 
6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -399,7 +399,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c index 3be43edde..ab9cdfae8 100644 --- a/driver/level3/trmm_R.c +++ b/driver/level3/trmm_R.c @@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < ls - js; jjs += min_jj){ min_jj = ls - js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -146,7 +146,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -203,7 +203,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -258,7 +258,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -283,7 +283,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ min_jj = js - ls - min_l - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -344,7 +344,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index a07e00b3b..1a38740a3 100644 --- 
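The repeated trmm_L / trmm_R / level3 hunks above all make one change: they extend the AVX-512 panel-width heuristic to SAPPHIRERAPIDS. Pulled out of the surrounding loops, the clamp looks roughly like this (a sketch only; clamp_min_jj is illustrative, BLASLONG is the integer type used throughout these files, and the non-AVX-512 branch follows the generic 3x/1x fallback shown in the hunks):

static BLASLONG clamp_min_jj(BLASLONG min_jj, BLASLONG unroll_n, int avx512_gemm)
{
    if (avx512_gemm) {
        /* SKYLAKEX / COOPERLAKE / SAPPHIRERAPIDS: kernels want n >= 6*GEMM_UNROLL_N */
        if (min_jj >= 6 * unroll_n) min_jj = 6 * unroll_n;
    } else {
        if (min_jj >= 3 * unroll_n)  min_jj = 3 * unroll_n;
        else if (min_jj > unroll_n)  min_jj = unroll_n;
    }
    return min_jj;
}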
a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -49,6 +49,8 @@ GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" if (DYNAMIC_ARCH) if (ARM64) list(APPEND COMMON_SOURCES dynamic_arm64.c) + elseif (POWER) + list(APPEND COMMON_SOURCES dynamic_power.c) else () list(APPEND COMMON_SOURCES dynamic.c) endif () diff --git a/driver/others/Makefile b/driver/others/Makefile index d09444f56..4a421ef31 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -24,10 +24,14 @@ else ifeq ($(ARCH),zarch) COMMONOBJS += dynamic_zarch.$(SUFFIX) else +ifeq ($(ARCH),mips64) +COMMONOBJS += dynamic_mips64.$(SUFFIX) +else COMMONOBJS += dynamic.$(SUFFIX) endif endif endif +endif else COMMONOBJS += parameter.$(SUFFIX) endif @@ -92,10 +96,14 @@ else ifeq ($(ARCH),zarch) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX) else +ifeq ($(ARCH),mips64) +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX) +else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) endif endif endif +endif else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 30e0cc6c2..ec79075fe 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -209,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, - double *, BLASLONG, void *) = func; + double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], @@ -220,7 +221,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, - float *, BLASLONG, void *) = func; + float *, BLASLONG, void *) = (void (*) + (BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], @@ -232,7 +236,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / BFLOAT16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, - bfloat16 *, BLASLONG, void *) = func; + bfloat16 *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, bfloat16, + bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, + bfloat16 *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((bfloat16 *)args -> alpha)[0], @@ -243,7 +249,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / BLAS_STOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, bfloat16 *, BLASLONG, - float *, BLASLONG, void *) = func; + float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, bfloat16 *, BLASLONG, + float *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], @@ -254,7 +262,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / BLAS_DTOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, bfloat16 *, BLASLONG, - double *, BLASLONG, void *) = func; + double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, 
double, + double *, BLASLONG, bfloat16 *, BLASLONG, + double *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], @@ -271,7 +281,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, - xdouble *, BLASLONG, void *) = func; + xdouble *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((xdouble *)args -> alpha)[0], @@ -285,7 +297,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, - double *, BLASLONG, void *) = func; + double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], @@ -297,7 +311,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* COMPLEX / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, - float *, BLASLONG, void *) = func; + float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], @@ -425,7 +441,7 @@ blas_queue_t *tscq; #endif if (queue) { - int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; + int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine; atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1); @@ -503,7 +519,7 @@ blas_queue_t *tscq; legacy_exec(routine, queue -> mode, queue -> args, sb); } else if (queue -> mode & BLAS_PTHREAD) { - void (*pthreadcompat)(void *) = queue -> routine; + void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine; (pthreadcompat)(queue -> args); } else (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); @@ -871,13 +887,13 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ fprintf(STDERR, "\n"); #endif - routine = queue -> routine; + routine = (int (*)(blas_arg_t *, void *, void *, double *, double *, BLASLONG))queue -> routine; if (queue -> mode & BLAS_LEGACY) { legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); } else if (queue -> mode & BLAS_PTHREAD) { - void (*pthreadcompat)(void *) = queue -> routine; + void (*pthreadcompat)(void *) = (void (*)(void*))queue -> routine; (pthreadcompat)(queue -> args); } else (routine)(queue -> args, queue -> range_m, queue -> range_n, @@ -967,9 +983,11 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; #if defined(ARCH_MIPS64) +#ifndef DYNAMIC_ARCH //set parameters for different number of threads. 
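Illustrative aside on the ctest allocation hunks earlier in this section: casts such as (*n+*k)*(size_t)LDA*sizeof(double) exist because the leading factors are plain int, so the product can wrap before it is widened for malloc(). A minimal standalone sketch, with hypothetical dimensions that are not taken from the test suite:

```c
#include <stdio.h>

int main(void) {
    /* Hypothetical large dimensions; unsigned so the wrap is well defined. */
    unsigned int n = 70000, lda = 70000;

    size_t wrapped = (size_t)(n * lda) * sizeof(double); /* 32-bit product wraps first    */
    size_t widened = (size_t)n * lda * sizeof(double);   /* widened before multiplication */

    printf("32-bit arithmetic : %zu bytes\n", wrapped);  /* far too small a request       */
    printf("size_t arithmetic : %zu bytes\n", widened);  /* the size actually intended    */
    return 0;
}
```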
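The blas_server.c hunks above add explicit casts whenever the queued routine pointer is assigned to a concretely typed function pointer, which keeps stricter compilers from rejecting the implicit conversion. A reduced sketch of that dispatch pattern; the names and signature here are illustrative stand-ins, not the real OpenBLAS ones:

```c
#include <stdio.h>

typedef void (*generic_fn)(void);   /* generic slot for any queued routine   */
typedef void (*gemm_like_fn)(int, int, int, double, const double *, int);

static void fake_gemm(int m, int n, int k, double alpha, const double *a, int lda) {
    (void)a;                        /* a real kernel would touch the matrix  */
    printf("fake_gemm m=%d n=%d k=%d alpha=%g lda=%d\n", m, n, k, alpha, lda);
}

struct queue_entry {
    generic_fn routine;             /* stored without its concrete signature */
};

static void exec_entry(struct queue_entry *q) {
    /* Explicit cast back to the exact type before calling, as in legacy_exec(). */
    gemm_like_fn afunc = (gemm_like_fn)q->routine;
    double a[4] = {0.0};
    afunc(2, 2, 2, 1.0, a, 2);
}

int main(void) {
    struct queue_entry q = { (generic_fn)fake_gemm };
    exec_entry(&q);
    return 0;
}
```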
blas_set_parameter(); #endif +#endif } @@ -1022,38 +1040,39 @@ int BLASFUNC(blas_thread_shutdown)(void){ int i; - if (!blas_server_avail) return 0; - LOCK_COMMAND(&server_lock); - for (i = 0; i < blas_num_threads - 1; i++) { + if (blas_server_avail) { + for (i = 0; i < blas_num_threads - 1; i++) { - pthread_mutex_lock (&thread_status[i].lock); - atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1); - thread_status[i].status = THREAD_STATUS_WAKEUP; - pthread_cond_signal (&thread_status[i].wakeup); + pthread_mutex_lock (&thread_status[i].lock); - pthread_mutex_unlock(&thread_status[i].lock); + atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1); + thread_status[i].status = THREAD_STATUS_WAKEUP; + pthread_cond_signal (&thread_status[i].wakeup); - } + pthread_mutex_unlock(&thread_status[i].lock); - for(i = 0; i < blas_num_threads - 1; i++){ - pthread_join(blas_threads[i], NULL); - } + } - for(i = 0; i < blas_num_threads - 1; i++){ - pthread_mutex_destroy(&thread_status[i].lock); - pthread_cond_destroy (&thread_status[i].wakeup); - } + for(i = 0; i < blas_num_threads - 1; i++){ + pthread_join(blas_threads[i], NULL); + } + + for(i = 0; i < blas_num_threads - 1; i++){ + pthread_mutex_destroy(&thread_status[i].lock); + pthread_cond_destroy (&thread_status[i].wakeup); + } #ifdef NEED_STACKATTR - pthread_attr_destory(&attr); + pthread_attr_destroy(&attr); #endif - blas_server_avail = 0; + blas_server_avail = 0; + } UNLOCK_COMMAND(&server_lock); return 0; diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 42f289441..33b58f134 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -40,7 +40,7 @@ #include #include "common.h" -#if defined(OS_CYGWIN_NT) && !defined(unlikely) +#if !defined(unlikely) #ifdef __GNUC__ #define unlikely(x) __builtin_expect(!!(x), 0) #else @@ -391,8 +391,9 @@ int blas_thread_init(void){ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ -#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) +#if defined(SMP_SERVER) // Handle lazy re-init of the thread-pool after a POSIX fork + // on Cygwin or as delayed init when a static library is used if (unlikely(blas_server_avail == 0)) blas_thread_init(); #endif diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 58f4d8b59..52a7c6087 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -292,6 +292,7 @@ extern gotoblas_t gotoblas_COOPERLAKE; #define VENDOR_AMD 2 #define VENDOR_CENTAUR 3 #define VENDOR_HYGON 4 +#define VENDOR_ZHAOXIN 5 #define VENDOR_UNKNOWN 99 #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) @@ -404,6 +405,7 @@ static int get_vendor(void){ if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; + if (!strcmp(vendor.vchar, " Shanghai ")) return VENDOR_ZHAOXIN; if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; @@ -414,7 +416,7 @@ static int get_vendor(void){ static gotoblas_t *get_coretype(void){ int eax, ebx, ecx, edx; - int family, exfamily, model, vendor, exmodel; + int family, exfamily, model, vendor, exmodel, stepping; cpuid(1, &eax, &ebx, &ecx, &edx); @@ -422,6 +424,7 @@ static gotoblas_t *get_coretype(void){ exfamily = BITMASK(eax, 20, 0xff); model = BITMASK(eax, 4, 0x0f); exmodel = BITMASK(eax, 16, 0x0f); + stepping = BITMASK(eax, 0, 0x0f); vendor = get_vendor(); @@ -621,11 +624,27 
@@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; } } + if (model == 10 || model == 12){ + // Ice Lake SP + if(support_avx512_bf16()) + return &gotoblas_COOPERLAKE; + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } return NULL; case 7: if (model == 10) // Goldmont Plus return &gotoblas_NEHALEM; - if (model == 14) { + if (model == 13 || model == 14) { // Ice Lake if (support_avx512()) return &gotoblas_SKYLAKEX; @@ -642,8 +661,68 @@ static gotoblas_t *get_coretype(void){ } } return NULL; - case 9: case 8: + if (model == 12 || model == 13) { // Tiger Lake + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + if (model == 14 ) { // Kaby Lake, Coffee Lake + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + if (model == 15){ // Sapphire Rapids + if(support_avx512_bf16()) + return &gotoblas_COOPERLAKE; + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + return NULL; + + + case 9: + if (model == 7 || model == 10) { // Alder Lake + if(support_avx512_bf16()) + return &gotoblas_COOPERLAKE; + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } if (model == 14 ) { // Kaby Lake, Coffee Lake if(support_avx2()) return &gotoblas_HASWELL; @@ -655,8 +734,9 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } + return NULL; case 10: - if (model == 5 || model == 6) { + if (model == 5 || model == 6) { if(support_avx2()) return &gotoblas_HASWELL; if(support_avx()) { @@ -666,7 +746,20 @@ static gotoblas_t *get_coretype(void){ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } - } + } + if (model == 7) { + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
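Each of the new model IDs added to dynamic.c above (Ice Lake SP, Tiger Lake, Sapphire Rapids, Alder Lake) gets the same selection ladder: prefer the widest usable SIMD kernel and warn on every step down. The sketch below approximates that ladder with the GCC/Clang x86 builtins; the real code uses its own support_avx512()/support_avx2() helpers that also verify OS state, and the kernel names printed here are only labels.

```c
#include <stdio.h>

/* x86 with GCC or Clang only: __builtin_cpu_supports() queries CPUID feature bits. */
static const char *pick_kernel(void) {
    __builtin_cpu_init();
    if (__builtin_cpu_supports("avx512f")) return "SKYLAKEX-class (AVX-512)";
    if (__builtin_cpu_supports("avx2"))    return "HASWELL-class (AVX2)";
    if (__builtin_cpu_supports("avx"))     return "SANDYBRIDGE-class (AVX)";
    return "NEHALEM-class (SSE4.2 fallback)";
}

int main(void) {
    printf("selected kernel set: %s\n", pick_kernel());
    return 0;
}
```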
+ } + } return NULL; } case 0xf: @@ -779,10 +872,19 @@ static gotoblas_t *get_coretype(void){ if (vendor == VENDOR_CENTAUR) { switch (family) { case 0x6: - return &gotoblas_NANO; + if (model == 0xf && stepping < 0xe) + return &gotoblas_NANO; + return &gotoblas_NEHALEM; + default: + if (family >= 0x7) + return &gotoblas_NEHALEM; } } + if (vendor == VENDOR_ZHAOXIN) { + return &gotoblas_NEHALEM; + } + return NULL; } @@ -962,7 +1064,13 @@ void gotoblas_dynamic_init(void) { #ifdef ARCH_X86 if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; #else - if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; + if (gotoblas == NULL) { + if (support_avx512_bf16()) gotoblas = &gotoblas_COOPERLAKE; + else if (support_avx512()) gotoblas = &gotoblas_SKYLAKEX; + else if (support_avx2()) gotoblas = &gotoblas_HASWELL; + else if (support_avx()) gotoblas = &gotoblas_SANDYBRIDGE; + else gotoblas = &gotoblas_PRESCOTT; + } /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ if (sizeof(void*) == 8) { if (gotoblas == &gotoblas_KATMAI || diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 4f1b12f27..45ea9f113 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -43,6 +43,68 @@ #endif extern gotoblas_t gotoblas_ARMV8; +#ifdef DYNAMIC_LIST +#ifdef DYN_CORTEXA53 +extern gotoblas_t gotoblas_CORTEXA53; +#else +#define gotoblas_CORTEXA53 gotoblas_ARMV8 +#endif +#ifdef DYN_CORTEXA57 +extern gotoblas_t gotoblas_CORTEXA57; +#else +#define gotoblas_CORTEXA57 gotoblas_ARMV8 +#endif +#ifdef DYN_CORTEXA72 +extern gotoblas_t gotoblas_CORTEXA72; +#else +#define gotoblas_CORTEXA72 gotoblas_ARMV8 +#endif +#ifdef DYN_CORTEXA73 +extern gotoblas_t gotoblas_CORTEXA73; +#else +#define gotoblas_CORTEXA73 gotoblas_ARMV8 +#endif +#ifdef DYN_FALKOR +extern gotoblas_t gotoblas_FALKOR; +#else +#define gotoblas_FALKOR gotoblas_ARMV8 +#endif +#ifdef DYN_TSV110 +extern gotoblas_t gotoblas_TSV110; +#else +#define gotoblas_TSV110 gotoblas_ARMV8 +#endif +#ifdef DYN_THUNDERX +extern gotoblas_t gotoblas_THUNDERX; +#else +#define gotoblas_THUNDERX gotoblas_ARMV8 +#endif +#ifdef DYN_THUNDERX2T99 +extern gotoblas_t gotoblas_THUNDERX2T99; +#else +#define gotoblas_THUNDERX2T99 gotoblas_ARMV8 +#endif +#ifdef DYN_THUNDERX3T110 +extern gotoblas_t gotoblas_THUNDERX3T110; +#else +#define gotoblas_THUNDERX3T110 gotoblas_ARMV8 +#endif +#ifdef DYN_EMAG8180 +extern gotoblas_t gotoblas_EMAG8180; +#else +#define gotoblas_EMAG8180 gotoblas_ARMV8 +#endif +#ifdef DYN_NEOVERSEN1 +extern gotoblas_t gotoblas_NEOVERSEN1; +#else +#define gotoblas_NEOVERSEN1 gotoblas_ARMV8 +#endif +#ifdef DYN_CORTEX_A55 +extern gotoblas_t gotoblas_CORTEXA55; +#else +#define gotoblas_CORTEXA55 gotoblas_ARMV8 +#endif +#else extern gotoblas_t gotoblas_CORTEXA53; extern gotoblas_t gotoblas_CORTEXA57; extern gotoblas_t gotoblas_CORTEXA72; @@ -54,10 +116,12 @@ extern gotoblas_t gotoblas_TSV110; extern gotoblas_t gotoblas_EMAG8180; extern gotoblas_t gotoblas_NEOVERSEN1; extern gotoblas_t gotoblas_THUNDERX3T110; +extern gotoblas_t gotoblas_CORTEXA55; +#endif extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 12 +#define NUM_CORETYPES 13 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -68,7 +132,7 @@ extern void openblas_warning(int verbose, const char * msg); #endif #define get_cpu_ftr(id, var) ({ \ - __asm__("mrs %0, "#id : "=r" (var)); \ + __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ }) static char *corename[] = { @@ -83,7 +147,10 @@ static char *corename[] = { "tsv110", 
"emag8180", "neoversen1", + "neoversev1", + "neoversen2", "thunderx3t110", + "cortexa55", "unknown" }; @@ -100,6 +167,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; + if (gotoblas == &gotoblas_CORTEXA55) return corename[12]; return corename[NUM_CORETYPES]; } @@ -131,6 +199,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 9: return (&gotoblas_EMAG8180); case 10: return (&gotoblas_NEOVERSEN1); case 11: return (&gotoblas_THUNDERX3T110); + case 12: return (&gotoblas_CORTEXA55); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -189,6 +258,8 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_CORTEXA73; case 0xd0c: // Neoverse N1 return &gotoblas_NEOVERSEN1; + case 0xd05: // Cortex A55 + return &gotoblas_CORTEXA55; } break; case 0x42: // Broadcom diff --git a/driver/others/dynamic_mips64.c b/driver/others/dynamic_mips64.c new file mode 100644 index 000000000..9fd19d739 --- /dev/null +++ b/driver/others/dynamic_mips64.c @@ -0,0 +1,230 @@ +/***************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include "common.h" + +extern gotoblas_t gotoblas_LOONGSON3R3; +extern gotoblas_t gotoblas_LOONGSON3R4; + +extern void openblas_warning(int verbose, const char * msg); + +#define NUM_CORETYPES 2 + +static char *corename[] = { + "loongson3r3", + "loongson3r4", + "UNKNOWN" +}; + +char *gotoblas_corename(void) { + if (gotoblas == &gotoblas_LOONGSON3R3) return corename[0]; + if (gotoblas == &gotoblas_LOONGSON3R4) return corename[1]; + return corename[NUM_CORETYPES]; +} + +static gotoblas_t *force_coretype(char *coretype) { + int i; + int found = -1; + char message[128]; + + for ( i=0 ; i < NUM_CORETYPES; i++) + { + if (!strncasecmp(coretype, corename[i], 20)) + { + found = i; + break; + } + } + + switch (found) + { + case 0: return (&gotoblas_LOONGSON3R3); + case 1: return (&gotoblas_LOONGSON3R4); + } + snprintf(message, 128, "Core not found: %s\n", coretype); + openblas_warning(1, message); + return NULL; +} + +#define MMI_MASK 0x00000010 +#define MSA_MASK 0x00000020 + +int fd[2]; +int support_cpucfg; + +static void handler(int signum) +{ + close(fd[1]); + exit(1); +} + +/* Brief : Function to check if cpucfg supported on loongson + * Return: 1 supported + * 0 not supported + */ +static int cpucfg_test(void) { + pid_t pid; + int status = 0; + + support_cpucfg = 0; + pipe(fd); + pid = fork(); + if (pid == 0) { /* Subprocess */ + struct sigaction act; + close(fd[0]); + /* Set signal action for SIGILL. */ + act.sa_handler = handler; + sigaction(SIGILL,&act,NULL); + + /* Execute cpucfg in subprocess. */ + __asm__ volatile( + ".insn \n\t" + ".word (0xc8080118) \n\t" + ::: + ); + support_cpucfg = 1; + write(fd[1],&support_cpucfg,sizeof(support_cpucfg)); + close(fd[1]); + exit(0); + } else if (pid > 0){ /* Parent process*/ + close(fd[1]); + if ((waitpid(pid,&status,0) <= 0) || + (read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0)) + support_cpucfg = 0; + close(fd[0]); + } else { + support_cpucfg = 0; + } + + return support_cpucfg; +} + +static gotoblas_t *get_coretype_from_cpucfg(void) { + int flag = 0; + __asm__ volatile( + ".insn \n\t" + "dli $8, 0x01 \n\t" + ".word (0xc9084918) \n\t" + "usw $9, 0x00(%0) \n\t" + : + : "r"(&flag) + : "memory" + ); + if (flag & MSA_MASK) + return (&gotoblas_LOONGSON3R4); + if (flag & MMI_MASK) + return (&gotoblas_LOONGSON3R3); + return NULL; +} + +static gotoblas_t *get_coretype_from_cpuinfo(void) { +#ifdef linux + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + //Check model name for Loongson3 + infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("model name", buffer, 10)){ + p = strchr(buffer, ':') + 2; + break; + } + } + fclose(infile); + if(p != NULL){ + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")) + return (&gotoblas_LOONGSON3R3); + else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")) + return (&gotoblas_LOONGSON3R4); + else + return NULL; + } +#endif + return NULL; +} + +static gotoblas_t *get_coretype(void) { + int ret = 0; + + ret = cpucfg_test(); + if (ret == 1) + return get_coretype_from_cpucfg(); + else + return get_coretype_from_cpuinfo(); +} + +void gotoblas_dynamic_init(void) { + char coremsg[128]; + char coren[22]; + char *p; + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + + if (gotoblas == 
NULL) + { + snprintf(coremsg, 128, "Falling back to loongson3r3 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_LOONGSON3R3; + } + + if (gotoblas && gotoblas->init) { + strncpy(coren, gotoblas_corename(), 20); + sprintf(coremsg, "Core: %s\n", coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 85fc5b3ba..2847ea9ae 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -6,10 +6,6 @@ extern gotoblas_t gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) extern gotoblas_t gotoblas_POWER9; #endif -//#if (!defined __GNUC__) || ( __GNUC__ >= 11) \ -// || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) -//#define HAVE_P10_SUPPORT 1 -//#endif #ifdef HAVE_P10_SUPPORT extern gotoblas_t gotoblas_POWER10; #endif @@ -27,7 +23,9 @@ static char *corename[] = { #define NUM_CORETYPES 4 char *gotoblas_corename(void) { +#ifndef C_PGI if (gotoblas == &gotoblas_POWER6) return corename[1]; +#endif if (gotoblas == &gotoblas_POWER8) return corename[2]; #if (!defined __GNUC__) || ( __GNUC__ >= 6) if (gotoblas == &gotoblas_POWER9) return corename[3]; @@ -38,10 +36,164 @@ char *gotoblas_corename(void) { return corename[0]; } +#if defined(__clang__) +static int __builtin_cpu_supports(char* arg) +{ + return 0; +} +#endif + +#if defined(C_PGI) || defined(__clang__) +/* + * NV HPC compilers do not yet implement __builtin_cpu_is(). + * Fake a version here for use in the CPU detection code below. + * + * Strategy here is to first check the CPU to see what it actually is, + * and then test the input to see if what the CPU actually is matches + * what was requested. + */ + +#include + +/* + * Define POWER processor version table. 
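The new dynamic_mips64.c above decides between the cpucfg path and /proc/cpuinfo by executing the possibly-unsupported instruction inside a forked child, catching SIGILL there, and reporting the verdict back through a pipe. The generic sketch below shows that probe shape; the MIPS instruction is left as a comment since it only assembles on Loongson hardware.

```c
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

static int pfd[2];

static void on_sigill(int sig) {
    (void)sig;
    _exit(1);                          /* child dies quietly if the insn traps */
}

static int probe_instruction(void) {
    int supported = 0;
    if (pipe(pfd) != 0) return 0;

    pid_t pid = fork();
    if (pid == 0) {                    /* child: arm handler, try the instruction */
        struct sigaction act;
        memset(&act, 0, sizeof(act));
        act.sa_handler = on_sigill;
        sigaction(SIGILL, &act, NULL);
        close(pfd[0]);
        /* __asm__ volatile(".word 0xc8080118"); would go here on MIPS64 */
        supported = 1;
        write(pfd[1], &supported, sizeof(supported));
        close(pfd[1]);
        _exit(0);
    }

    close(pfd[1]);                     /* parent: wait, then read the verdict */
    int status = 0;
    if (pid < 0 || waitpid(pid, &status, 0) <= 0 ||
        read(pfd[0], &supported, sizeof(supported)) <= 0)
        supported = 0;
    close(pfd[0]);
    return supported;
}

int main(void) {
    printf("instruction supported: %d\n", probe_instruction());
    return 0;
}
```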
+ * + * NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time + */ + +#define CPU_UNKNOWN 0 +#define CPU_POWER5 5 +#define CPU_POWER6 6 +#define CPU_POWER8 8 +#define CPU_POWER9 9 +#define CPU_POWER10 10 + +static struct { + uint32_t pvr_mask; + uint32_t pvr_value; + const char* cpu_name; + uint32_t cpu_type; +} pvrPOWER [] = { + + { /* POWER6 in P5+ mode; 2.04-compliant processor */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x0f000001, + .cpu_name = "POWER5+", + .cpu_type = CPU_POWER5, + }, + + { /* Power6 aka POWER6X*/ + .pvr_mask = 0xffff0000, + .pvr_value = 0x003e0000, + .cpu_name = "POWER6 (raw)", + .cpu_type = CPU_POWER6, + }, + + { /* Power7 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x003f0000, + .cpu_name = "POWER7 (raw)", + .cpu_type = CPU_POWER6, + }, + + { /* Power7+ */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004A0000, + .cpu_name = "POWER7+ (raw)", + .cpu_type = CPU_POWER6, + }, + + { /* Power8E */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004b0000, + .cpu_name = "POWER8E (raw)", + .cpu_type = CPU_POWER8, + }, + + { /* Power8NVL */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004c0000, + .cpu_name = "POWER8NVL (raw)", + .cpu_type = CPU_POWER8, + }, + + { /* Power8 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004d0000, + .cpu_name = "POWER8 (raw)", + .cpu_type = CPU_POWER8, + }, + + { /* Power9 DD2.0 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0200, + .cpu_name = "POWER9 (raw)", + .cpu_type = CPU_POWER9, + }, + + { /* Power9 DD 2.1 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0201, + .cpu_name = "POWER9 (raw)", + .cpu_type = CPU_POWER9, + }, + + { /* Power9 DD2.2 or later */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004e0000, + .cpu_name = "POWER9 (raw)", + .cpu_type = CPU_POWER9, + }, + + { /* Power10 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00800000, + .cpu_name = "POWER10 (raw)", + .cpu_type = CPU_POWER10, + }, + + { /* End of table, pvr_mask and pvr_value must be zero */ + .pvr_mask = 0x0, + .pvr_value = 0x0, + .cpu_name = "Unknown", + .cpu_type = CPU_UNKNOWN, + }, +}; + +static int __builtin_cpu_is(const char *cpu) { + int i; + uint32_t pvr; + uint32_t cpu_type; + + asm("mfpvr %0" : "=r"(pvr)); + + for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) { + if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) { + break; + } + } + +#if defined(DEBUG) + printf("%s: returning CPU=%s, cpu_type=%p\n", __func__, + pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type); +#endif + cpu_type = pvrPOWER[i].cpu_type; + + if (!strcmp(cpu, "power8")) + return cpu_type == CPU_POWER8; + if (!strcmp(cpu, "power9")) + return cpu_type == CPU_POWER9; + return 0; +} + +#endif /* C_PGI */ + static gotoblas_t *get_coretype(void) { +#ifndef C_PGI if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) return &gotoblas_POWER6; +#endif if (__builtin_cpu_is("power8")) return &gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) @@ -52,6 +204,11 @@ static gotoblas_t *get_coretype(void) { if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) return &gotoblas_POWER10; #endif + /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ +#if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) + if (__builtin_cpu_is("power10")) + return &gotoblas_POWER9; +#endif return NULL; } @@ -72,7 +229,9 @@ static gotoblas_t *force_coretype(char * coretype) { switch (found) { +#ifndef C_PGI case 1: return (&gotoblas_POWER6); +#endif case 2: return (&gotoblas_POWER8); #if 
(!defined __GNUC__) || ( __GNUC__ >= 6) case 3: return (&gotoblas_POWER9); diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index bf5eab9b2..5b45aae2f 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -1,38 +1,7 @@ #include "common.h" +#include "cpuid_zarch.h" #include -// Guard the use of getauxval() on glibc version >= 2.16 -#ifdef __GLIBC__ -#include -#if __GLIBC_PREREQ(2, 16) -#include -#define HAVE_GETAUXVAL 1 - -static unsigned long get_hwcap(void) -{ - unsigned long hwcap = getauxval(AT_HWCAP); - char *maskenv; - - // honor requests for not using specific CPU features in LD_HWCAP_MASK - maskenv = getenv("LD_HWCAP_MASK"); - if (maskenv) - hwcap &= strtoul(maskenv, NULL, 0); - - return hwcap; - // note that a missing auxval is interpreted as no capabilities - // available, which is safe. -} - -#else // __GLIBC_PREREQ(2, 16) -#warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" - -static unsigned long get_hwcap(void) { - // treat missing support for getauxval() as no capabilities available, - // which is safe. - return 0; -} -#endif // __GLIBC_PREREQ(2, 16) -#endif // __GLIBC extern gotoblas_t gotoblas_ZARCH_GENERIC; #ifdef DYN_Z13 @@ -44,25 +13,19 @@ extern gotoblas_t gotoblas_Z14; #define NUM_CORETYPES 4 +extern int openblas_verbose(); extern void openblas_warning(int verbose, const char* msg); -static char* corename[] = { - "unknown", - "Z13", - "Z14", - "ZARCH_GENERIC", -}; - char* gotoblas_corename(void) { #ifdef DYN_Z13 - if (gotoblas == &gotoblas_Z13) return corename[1]; + if (gotoblas == &gotoblas_Z13) return cpuname[CPU_Z13]; #endif #ifdef DYN_Z14 - if (gotoblas == &gotoblas_Z14) return corename[2]; + if (gotoblas == &gotoblas_Z14) return cpuname[CPU_Z14]; #endif - if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; + if (gotoblas == &gotoblas_ZARCH_GENERIC) return cpuname[CPU_GENERIC]; - return corename[0]; + return "unknown"; } #ifndef HWCAP_S390_VXE @@ -79,25 +42,28 @@ char* gotoblas_corename(void) { */ static gotoblas_t* get_coretype(void) { - unsigned long hwcap __attribute__((unused)) = get_hwcap(); + int cpu = detect(); -#ifdef DYN_Z14 + switch(cpu) { // z14 and z15 systems: exploit Vector Facility (SIMD) and // Vector-Enhancements Facility 1 (float SIMD instructions), if present. 
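For toolchains without __builtin_cpu_is() (NV HPC, clang), the dynamic_power.c hunks above substitute a mask/value table keyed on the PVR register. The matcher reduces to the loop below; the PVR value is hard-coded here for illustration, where the real code reads it with mfpvr, and only a few table rows are shown.

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct pvr_entry {
    uint32_t    mask;
    uint32_t    value;
    const char *name;
};

static const struct pvr_entry table[] = {
    { 0xffff0000, 0x004d0000, "POWER8"  },
    { 0xffff0000, 0x004e0000, "POWER9"  },
    { 0xffff0000, 0x00800000, "POWER10" },
    { 0x00000000, 0x00000000, "unknown" },   /* sentinel matches everything */
};

static const char *match_pvr(uint32_t pvr) {
    const struct pvr_entry *e = table;
    while (e->mask != 0 && (pvr & e->mask) != e->value)
        e++;
    return e->name;
}

int main(void) {
    uint32_t fake_pvr = 0x004e0201;          /* a POWER9 DD2.1 value, for demonstration */
    printf("PVR 0x%08" PRIx32 " -> %s\n", fake_pvr, match_pvr(fake_pvr));
    return 0;
}
```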
- if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) + case CPU_Z14: +#ifdef DYN_Z14 return &gotoblas_Z14; #endif -#ifdef DYN_Z13 // z13: Vector Facility (SIMD for double) - if (hwcap & HWCAP_S390_VX) + case CPU_Z13: +#ifdef DYN_Z13 return &gotoblas_Z13; #endif + default: // fallback in case of missing compiler support, systems before z13, or // when the OS does not advertise support for the Vector Facility (e.g., // missing support in the OS kernel) - return &gotoblas_ZARCH_GENERIC; + return &gotoblas_ZARCH_GENERIC; + } } static gotoblas_t* force_coretype(char* coretype) { @@ -108,28 +74,28 @@ static gotoblas_t* force_coretype(char* coretype) { for (i = 0; i < NUM_CORETYPES; i++) { - if (!strncasecmp(coretype, corename[i], 20)) + if (!strncasecmp(coretype, cpuname[i], 20)) { found = i; break; } } - if (found == 1) { + if (found == CPU_Z13) { #ifdef DYN_Z13 return &gotoblas_Z13; #else openblas_warning(1, "Z13 support not compiled in"); return NULL; #endif - } else if (found == 2) { + } else if (found == CPU_Z14) { #ifdef DYN_Z14 return &gotoblas_Z14; #else openblas_warning(1, "Z14 support not compiled in"); return NULL; #endif - } else if (found == 3) { + } else if (found == CPU_GENERIC) { return &gotoblas_ZARCH_GENERIC; } @@ -155,6 +121,11 @@ void gotoblas_dynamic_init(void) { else { gotoblas = get_coretype(); + if (openblas_verbose() >= 2) { + snprintf(coremsg, sizeof(coremsg), "Choosing kernels based on getauxval(AT_HWCAP)=0x%lx\n", + getauxval(AT_HWCAP)); + openblas_warning(2, coremsg); + } } if (gotoblas == NULL) @@ -165,9 +136,11 @@ void gotoblas_dynamic_init(void) { } if (gotoblas && gotoblas->init) { - strncpy(coren, gotoblas_corename(), 20); - sprintf(coremsg, "Core: %s\n", coren); - openblas_warning(2, coremsg); + if (openblas_verbose() >= 2) { + strncpy(coren, gotoblas_corename(), 20); + sprintf(coremsg, "Core: %s\n", coren); + openblas_warning(2, coremsg); + } gotoblas->init(); } else { diff --git a/driver/others/memory.c b/driver/others/memory.c index f0521ab2d..0f4cbb24d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -73,6 +73,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
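The reworked dynamic_zarch.c above folds detection into a shared detect() call from cpuid_zarch.h and then lets the selection switch deliberately fall through to the next lower level whenever the matching kernel was not compiled in. The shape of that switch, with generic stand-in names for the levels and macros:

```c
#include <stdio.h>

enum cpu_level { CPU_GENERIC = 0, CPU_LEVEL1 = 1, CPU_LEVEL2 = 2 };

static const char *select_kernel(enum cpu_level cpu) {
    switch (cpu) {
    case CPU_LEVEL2:
#ifdef HAVE_LEVEL2_KERNEL
        return "level-2 kernel";
#endif
        /* fall through: level-2 hardware, but no level-2 kernel built */
    case CPU_LEVEL1:
#ifdef HAVE_LEVEL1_KERNEL
        return "level-1 kernel";
#endif
        /* fall through */
    default:
        return "generic kernel";
    }
}

int main(void) {
    printf("%s\n", select_kernel(CPU_LEVEL2));   /* "generic kernel" unless -DHAVE_* given */
    return 0;
}
```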
#include "common.h" +#ifndef likely +#ifdef __GNUC__ +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#else +#define likely(x) (x) +#define unlikely(x) (x) +#endif +#endif + #if defined(USE_TLS) && defined(SMP) #define COMPILE_TLS @@ -222,11 +232,11 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; + int ret; +#if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; - int ret; -#if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) @@ -236,6 +246,15 @@ int get_num_procs(void) { #endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + +#if defined(USE_OPENMP) +#if _OPENMP >= 201511 + ret = omp_get_num_places(); + if (ret >0 ) nums = ret; +#endif + return nums; +#endif + #if !defined(OS_LINUX) return nums; #endif @@ -428,7 +447,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) int max_num; #endif int blas_goto_num = 0; @@ -436,7 +455,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) max_num = get_num_procs(); #endif @@ -460,7 +479,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -1241,7 +1260,7 @@ UNLOCK_COMMAND(&alloc_lock); func = &memoryalloc[0]; - while ((func != NULL) && (map_address == (void *) -1)) { + while ((*func != NULL) && (map_address == (void *) -1)) { map_address = (*func)((void *)base_address); @@ -1291,7 +1310,12 @@ UNLOCK_COMMAND(&alloc_lock); return (void *)(((char *)alloc_info) + sizeof(struct alloc_t)); error: - printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n"); + printf("OpenBLAS : Program will terminate because you tried to allocate too many TLS memory regions.\n"); + printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); + printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); + printf("a sufficiently small number. 
This error typically occurs when the software that relies on\n"); + printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n"); + printf("cpu cores than what OpenBLAS was configured to handle.\n"); return NULL; } @@ -1619,10 +1643,12 @@ static int on_process_term(void) #else #pragma data_seg(".CRT$XLB") #endif -static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; + #ifdef _WIN64 +static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma const_seg() #else +static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma data_seg() #endif @@ -1631,10 +1657,12 @@ static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI #else #pragma data_seg(".CRT$XTU") #endif -static int(*p_process_term)(void) = on_process_term; + #ifdef _WIN64 +static const int(*p_process_term)(void) = on_process_term; #pragma const_seg() #else +static int(*p_process_term)(void) = on_process_term; #pragma data_seg() #endif #endif @@ -1668,16 +1696,23 @@ void gotoblas_dummy_for_PGI(void) { #ifndef MEM_LARGE_PAGES #define MEM_LARGE_PAGES 0x20000000 #endif -#else +#elif !defined(OS_EMBEDDED) #define ALLOC_MMAP #define ALLOC_MALLOC +#else +#define ALLOC_MALLOC + +inline int puts(const char *str) { return 0; } +inline int printf(const char *format, ...) { return 0; } +inline char *getenv(const char *name) { return ""; } +inline int atoi(const char *str) { return 0; } #endif #include #include #include -#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) +#if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED) #include #ifndef NO_SYSV_IPC #include @@ -1691,7 +1726,6 @@ void gotoblas_dummy_for_PGI(void) { #include #include #include -#include #include #include #include @@ -1767,11 +1801,12 @@ int get_num_procs(void); int get_num_procs(void) { static int nums = 0; - + int ret; + #if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; - int ret; + #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) @@ -1781,10 +1816,20 @@ int get_num_procs(void) { #endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + +#if defined(USE_OPENMP) +/* if (omp_get_proc_bind() != omp_proc_bind_false) */ +#if _OPENMP >= 201511 + ret = omp_get_num_places(); + if (ret >0 ) nums = ret; +#endif + return nums; +#endif + #if !defined(OS_LINUX) return nums; #endif - + #if !defined(__GLIBC_PREREQ) return nums; #else @@ -1969,7 +2014,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) int max_num; #endif int blas_goto_num = 0; @@ -1977,7 +2022,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || 
defined(OS_ANDROID) || defined(OS_HAIKU) max_num = get_num_procs(); #endif @@ -2001,7 +2046,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -2045,6 +2090,7 @@ struct release_t { int hugetlb_allocated = 0; static struct release_t release_info[NUM_BUFFERS]; +static struct release_t *new_release_info; static int release_pos = 0; #if defined(OS_LINUX) && !defined(NO_WARMUP) @@ -2095,8 +2141,13 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; + } release_pos ++; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); @@ -2259,8 +2310,13 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; + } release_pos ++; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); @@ -2292,8 +2348,13 @@ static void *alloc_malloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_malloc_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_malloc_free; + } release_pos ++; } @@ -2326,8 +2387,13 @@ static void *alloc_qalloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_qalloc_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_qalloc_free; + } release_pos ++; } @@ -2355,8 +2421,13 @@ static void *alloc_windows(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_windows_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_windows_free; + } release_pos ++; } @@ -2399,9 +2470,15 @@ static void *alloc_devicedirver(void *address){ fd, 0); if 
(map_address != (void *)-1) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_devicedirver_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = fd; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_devicedirver_free; + } release_pos ++; } @@ -2435,9 +2512,15 @@ static void *alloc_shm(void *address){ shmctl(shmid, IPC_RMID, 0); + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = shmid; release_info[release_pos].func = alloc_shm_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = shmid; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_shm_free; + } release_pos ++; } @@ -2541,8 +2624,13 @@ static void *alloc_hugetlb(void *address){ #endif if (map_address != (void *)-1){ + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_hugetlb_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlb_free; + } release_pos ++; } @@ -2589,9 +2677,15 @@ static void *alloc_hugetlbfile(void *address){ fd, 0); if (map_address != (void *)-1) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_hugetlbfile_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = fd; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlbfile_free; + } release_pos ++; } @@ -2621,8 +2715,25 @@ static volatile struct { } memory[NUM_BUFFERS]; -static int memory_initialized = 0; +struct newmemstruct +{ + BLASULONG lock; + void *addr; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + int pos; +#endif + int used; +#ifndef __64BIT__ + char dummy[48]; +#else + char dummy[40]; +#endif +}; +static volatile struct newmemstruct *newmemory; + +static int memory_initialized = 0; +static int memory_overflowed = 0; /* Memory allocation routine */ /* procpos ... 
indicates where it comes from */ /* 0 : Level 3 functions */ @@ -2631,6 +2742,8 @@ static int memory_initialized = 0; void *blas_memory_alloc(int procpos){ + int i; + int position; #if defined(WHEREAMI) && !defined(USE_OPENMP) int mypos = 0; @@ -2761,6 +2874,25 @@ void *blas_memory_alloc(int procpos){ position ++; } while (position < NUM_BUFFERS); + + if (memory_overflowed) { + + do { + RMB; +#if defined(USE_OPENMP) + if (!newmemory[position-NUM_BUFFERS].used) { + blas_lock(&newmemory[position-NUM_BUFFERS].lock); +#endif + if (!newmemory[position-NUM_BUFFERS].used) goto allocation2; + +#if defined(USE_OPENMP) + blas_unlock(&newmemory[position-NUM_BUFFERS].lock); + } +#endif + position ++; + + } while (position < 512+NUM_BUFFERS); + } #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif @@ -2788,7 +2920,7 @@ void *blas_memory_alloc(int procpos){ func = &memoryalloc[0]; - while ((func != NULL) && (map_address == (void *) -1)) { + while ((*func != NULL) && (map_address == (void *) -1)) { map_address = (*func)((void *)base_address); @@ -2868,8 +3000,102 @@ void *blas_memory_alloc(int procpos){ return (void *)memory[position].addr; error: - printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif + if (memory_overflowed) goto terminate; + fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n"); + memory_overflowed=1; + new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); + newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); + for (i = 0; i < 512; i++) { + newmemory[i].addr = (void *)0; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + newmemory[i].pos = -1; +#endif + newmemory[i].used = 0; + newmemory[i].lock = 0; +} + +allocation2: + newmemory[position-NUM_BUFFERS].used = 1; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#else + blas_unlock(&newmemory[position-NUM_BUFFERS].lock); +#endif + do { +#ifdef DEBUG + printf("Allocation Start : %lx\n", base_address); +#endif + + map_address = (void *)-1; + + func = &memoryalloc[0]; + + while ((*func != NULL) && (map_address == (void *) -1)) { + + map_address = (*func)((void *)base_address); + +#ifdef ALLOC_DEVICEDRIVER + if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); + } +#endif + +#ifdef ALLOC_HUGETLBFILE + if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { +#ifndef OS_WINDOWS + fprintf(stderr, "OpenBLAS Warning ... 
HugeTLB(File) allocation was failed.\n"); +#endif + } +#endif + +#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) + if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; +#endif + + func ++; + } +#ifdef DEBUG + printf(" Success -> %08lx\n", map_address); +#endif + if (((BLASLONG) map_address) == -1) base_address = 0UL; + + if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; + + } while ((BLASLONG)map_address == -1); + +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif + newmemory[position-NUM_BUFFERS].addr = map_address; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif + +#ifdef DEBUG + printf(" Mapping Succeeded. %p(%d)\n", (void *)newmemory[position-NUM_BUFFERS].addr, position); +#endif + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + + if (newmemory[position-NUM_BUFFERS].pos == -1) newmemory[position-NUM_BUFFERS].pos = mypos; + +#endif + return (void *)newmemory[position-NUM_BUFFERS].addr; + +terminate: +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif + printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); + printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); + printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); + printf("a sufficiently small number. This error typically occurs when the software that relies on\n"); + printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n"); + printf("cpu cores than what OpenBLAS was configured to handle.\n"); return NULL; } @@ -2888,13 +3114,28 @@ void blas_memory_free(void *free_area){ while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) position++; - if (position >= NUM_BUFFERS) goto error; + if (position >= NUM_BUFFERS && !memory_overflowed) goto error; #ifdef DEBUG if (memory[position].addr != free_area) goto error; printf(" Position : %d\n", position); #endif + if (unlikely(memory_overflowed && position >= NUM_BUFFERS)) { + while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area)) + position++; + // arm: ensure all writes are finished before other thread takes this memory + WMB; + + newmemory[position].used = 0; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif +#ifdef DEBUG + printf("Unmap from overflow area succeeded.\n\n"); +#endif + return; +} else { // arm: ensure all writes are finished before other thread takes this memory WMB; @@ -2908,7 +3149,7 @@ void blas_memory_free(void *free_area){ #endif return; - +} error: printf("BLAS : Bad memory unallocation! 
: %4d %p\n", position, free_area); @@ -2943,7 +3184,10 @@ void blas_shutdown(void){ LOCK_COMMAND(&alloc_lock); for (pos = 0; pos < release_pos; pos ++) { + if (likely(pos < NUM_BUFFERS)) release_info[pos].func(&release_info[pos]); + else + new_release_info[pos-NUM_BUFFERS].func(&new_release_info[pos-NUM_BUFFERS]); } #ifdef SEEK_ADDRESS @@ -2960,6 +3204,15 @@ void blas_shutdown(void){ #endif memory[pos].lock = 0; } + if (memory_overflowed) + for (pos = 0; pos < 512; pos ++){ + newmemory[pos].addr = (void *)0; + newmemory[pos].used = 0; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + newmemory[pos].pos = -1; +#endif + newmemory[pos].lock = 0; + } UNLOCK_COMMAND(&alloc_lock); diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 35fc0a253..0d5c6aec0 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -183,7 +183,7 @@ int get_L2_size(void){ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \ - defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) + defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -269,7 +269,7 @@ void blas_set_parameter(void){ int factor; #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \ defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \ - defined(SKYLAKEX) || defined(COOPERLAKE) + defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) int size = 16; #else int size = get_L2_size(); @@ -524,6 +524,9 @@ void blas_set_parameter(void){ xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M; #endif +#ifdef BUILD_BFLOAT16 + sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; +#endif sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; @@ -629,7 +632,9 @@ void blas_set_parameter(void){ xgemm_p = 16 * (size + 1); #endif +#ifdef BUILD_BFLOAT16 sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; +#endif sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; @@ -717,7 +722,7 @@ void blas_set_parameter(void){ #if defined(ARCH_MIPS64) void blas_set_parameter(void){ -#if defined(LOONGSON3A) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #ifdef SMP if(blas_num_threads == 1){ #endif @@ -731,20 +736,6 @@ void blas_set_parameter(void){ #endif #endif -#if defined(LOONGSON3B) -#ifdef SMP - if(blas_num_threads == 1 || blas_num_threads == 2){ -#endif - //single thread - dgemm_r = 640; -#ifdef SMP - }else{ - //multi thread - dgemm_r = 160; - } 
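The memory.c changes above stop hard-failing when more threads request buffers than the library was compiled for: once the static NUM_BUFFERS table is full, a warning is printed and a malloc'ed auxiliary table (newmemory / new_release_info) absorbs the overflow. The toy allocator below models just that two-tier structure; the sizes and slot contents are placeholders, not the real bookkeeping.

```c
#include <stdio.h>
#include <stdlib.h>

#define NUM_BUFFERS    4      /* stand-in for the compile-time thread limit */
#define OVERFLOW_SLOTS 8      /* stand-in for the 512-entry auxiliary table */

struct slot { void *addr; int used; };

static struct slot  fixed[NUM_BUFFERS];
static struct slot *overflow = NULL;

static void *pool_alloc(size_t bytes) {
    for (int i = 0; i < NUM_BUFFERS; i++)
        if (!fixed[i].used) {
            fixed[i].used = 1;
            return fixed[i].addr = malloc(bytes);
        }

    if (overflow == NULL) {   /* first overflow: grow once, keep going */
        fprintf(stderr, "warning: static pool exhausted, adding auxiliary table\n");
        overflow = calloc(OVERFLOW_SLOTS, sizeof(*overflow));
    }
    for (int i = 0; overflow != NULL && i < OVERFLOW_SLOTS; i++)
        if (!overflow[i].used) {
            overflow[i].used = 1;
            return overflow[i].addr = malloc(bytes);
        }
    return NULL;              /* both pools exhausted: only now give up */
}

int main(void) {
    for (int i = 0; i < NUM_BUFFERS + 2; i++)
        printf("alloc %d -> %p\n", i, pool_alloc(64));
    return 0;
}
```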
-#endif -#endif - } #endif diff --git a/exports/Makefile b/exports/Makefile index eec0593aa..baaa33623 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -139,9 +139,17 @@ endif ifneq (,$(filter 1 2,$(NOFORTRAN))) #only build without Fortran $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) +else +ifeq ($(F_COMPILER), INTEL) + $(FC) $(FFLAGS) $(LDFLAGS) -all-load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def +else +ifeq ($(F_COMPILER), FLANG) + $(FC) $(FFLAGS) $(LDFLAGS) -fno-fortran-main -Mnomain -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif +endif +endif dllinit.$(SUFFIX) : dllinit.c $(CC) $(CFLAGS) -c -o $(@F) -s $< diff --git a/exports/gensymbol b/exports/gensymbol index 857a17a9e..e7210a030 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Changelog # 2017/09/03 staticfloat diff --git a/f_check b/f_check index 42241ae10..71293b53f 100644 --- a/f_check +++ b/f_check @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); @@ -32,9 +32,9 @@ if ($compiler eq "") { "xlf95", "xlf90", "xlf", "ppuf77", "ppuf95", "ppuf90", "ppuxlf", "pathf90", "pathf95", - "pgf95", "pgf90", "pgf77", + "pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran", "flang", "egfortran", - "ifort"); + "ifort", "nagfor"); OUTER: foreach $lists (@lists) { @@ -64,7 +64,9 @@ if ($compiler eq "") { if (!$?) { $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`; - + if ($data eq "") { + $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`; + } if ($data =~ /zhoge_/) { $bu = "_"; } @@ -76,6 +78,7 @@ if ($compiler eq "") { } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) { + $data =~ s/\(+.*?\)+//g; $data =~ /(\d+)\.(\d+).(\d+)/; $major = $1; $minor = $2; @@ -87,7 +90,7 @@ if ($compiler eq "") { if ($compiler =~ /flang/) { $vendor = FLANG; $openmp = "-fopenmp"; - } elsif ($compiler =~ /pgf/) { + } elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) { $vendor = PGI; $openmp = "-mp"; } else { @@ -123,7 +126,7 @@ if ($compiler eq "") { $openmp = "-mp"; } - if ($data =~ /PGF/) { + if ($data =~ /PGF/ || $data =~ /NVF/) { $vendor = PGI; $openmp = "-mp"; } @@ -133,8 +136,16 @@ if ($compiler eq "") { $openmp = "-openmp"; } + if ($data =~ /NAG/) { + $vendor = NAG; + $openmp = "-openmp"; + } + # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. 
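In the parameter.c hunks just above, sbgemm_r is now computed only when BUILD_BFLOAT16 is set, using the same partition formula as the other *_r values: subtract the aligned packed-A panel from the scratch buffer, divide by one packed-B column, then round down to a multiple of 16. A worked version with made-up buffer, P, Q and element sizes:

```c
#include <stdio.h>

#define BUFFER_SIZE   (32UL << 20)   /* 32 MiB scratch buffer (illustrative) */
#define GEMM_OFFSET_A 0UL
#define GEMM_ALIGN    0x03fffUL      /* align the packed A panel to 16 KiB   */

static unsigned long gemm_r(unsigned long p, unsigned long q, unsigned long esize) {
    unsigned long a_panel = (p * q * esize + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN;
    return (((BUFFER_SIZE - a_panel) / (q * esize)) - 15) & ~15UL;
}

int main(void) {
    printf("sgemm_r  = %lu\n", gemm_r(768, 384, 4));   /* float, 4-byte elements         */
    printf("dgemm_r  = %lu\n", gemm_r(512, 256, 8));   /* double, 8-byte elements        */
    printf("sbgemm_r = %lu\n", gemm_r(768, 384, 4));   /* bfloat16 packed as in the diff */
    return 0;
}
```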
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; + if ($data eq "") { + $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`; + } if ($data =~ / zho_ge__/) { $need2bu = 1; } @@ -177,7 +188,7 @@ if ($compiler eq "") { $openmp = "-mp"; } - if ($compiler =~ /pgf/) { + if ($compiler =~ /pgf/ || $compiler =~ /nvf/) { $vendor = PGI; $bu = "_"; $openmp = "-mp"; @@ -222,6 +233,12 @@ if ($compiler eq "") { $openmp = "-fopenmp"; } + if ($compiler =~ /nagfor/) { + $vendor = NAG; + $bu = "_"; + $openmp = "-openmp"; + } + if ($vendor eq "") { $nofortran = 1; $compiler = "gfortran"; @@ -275,14 +292,20 @@ if (!$?) { if ($?) { $link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; } + #For nagfor + if ($?) { + $link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`; + } $binary = "" if ($?); } - if ($binary eq "") { $link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`; } } +if ( $vendor eq "NAG") { + $link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`; + } $linker_L = ""; $linker_l = ""; $linker_a = ""; @@ -291,11 +314,11 @@ if ($link ne "") { $link =~ s/\-Y\sP\,/\-Y/g; - $link =~ s/\-R\s*/\-rpath\@/g; + $link =~ s/\-R\s*/\-rpath\%/g; - $link =~ s/\-rpath\s+/\-rpath\@/g; + $link =~ s/\-rpath\s+/\-rpath\%/g; - $link =~ s/\-rpath-link\s+/\-rpath-link\@/g; + $link =~ s/\-rpath-link\s+/\-rpath-link\%/g; @flags = split(/[\s\,\n]/, $link); # remove leading and trailing quotes from each flag. @@ -321,22 +344,24 @@ if ($link ne "") { } - if ($flags =~ /^\-rpath\@/) { - $flags =~ s/\@/\,/g; + if ($flags =~ /^\-rpath\%/) { + $flags =~ s/\%/\,/g; $linker_L .= "-Wl,". $flags . " " ; } - if ($flags =~ /^\-rpath-link\@/) { - $flags =~ s/\@/\,/g; + if ($flags =~ /^\-rpath-link\%/) { + $flags =~ s/\%/\,/g; $linker_L .= "-Wl,". $flags . " " ; } - if ($flags =~ /-lgomp/ && $CC =~ /clang/) { + if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) { $flags = "-lomp"; } if ( ($flags =~ /^\-l/) + && ($flags !~ /ibrary/) && ($flags !~ /gfortranbegin/) + && ($flags !~ /flangmain/) && ($flags !~ /frtbegin/) && ($flags !~ /pathfstart/) && ($flags !~ /crt[0-9]/) @@ -352,15 +377,21 @@ if ($link ne "") { $linker_l .= $flags . " "; } + if ( $flags =~ /quickfit.o/ && $vendor == NAG) { + $linker_l .= $flags . " "; + } + if ( $flags =~ /safefit.o/ && $vendor == NAG) { + $linker_l .= $flags . " "; + } + if ( $flags =~ /thsafe.o/ && $vendor == NAG) { + $linker_l .= $flags . " "; + } + $linker_a .= $flags . " " if $flags =~ /\.a$/; } } -if ($vendor eq "INTEL"){ - $linker_a .= "-lgfortran" -} - if ($vendor eq "FLANG"){ $linker_a .= "-lflang" } diff --git a/getarch.c b/getarch.c index 9344defb5..00e544bc7 100644 --- a/getarch.c +++ b/getarch.c @@ -140,8 +140,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_PPC440FP2 */ /* #define FORCE_CELL */ /* #define FORCE_SICORTEX */ -/* #define FORCE_LOONGSON3A */ -/* #define FORCE_LOONGSON3B */ +/* #define FORCE_LOONGSON3R3 */ +/* #define FORCE_LOONGSON3R4 */ +/* #define FORCE_LOONGSON3R5 */ /* #define FORCE_I6400 */ /* #define FORCE_P6600 */ /* #define FORCE_P5600 */ @@ -312,6 +313,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -321,12 +332,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" #endif +#endif #ifdef FORCE_HASWELL #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -335,6 +357,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" +#endif #else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ @@ -349,10 +372,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef FORCE_SKYLAKEX -#ifdef NO_AVX512 #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX512 +#ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -362,10 +406,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" +#endif #else -#define FORCE -#define FORCE_INTEL -#define ARCHITECTURE "X86" #define SUBARCHITECTURE "SKYLAKEX" #define ARCHCONFIG "-DSKYLAKEX " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -379,10 +421,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef FORCE_COOPERLAKE -#ifdef NO_AVX512 #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX512 +#ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -392,10 +455,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" +#endif #else -#define FORCE -#define FORCE_INTEL -#define ARCHITECTURE "X86" #define SUBARCHITECTURE "COOPERLAKE" #define ARCHCONFIG "-DCOOPERLAKE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -408,6 +469,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif +#ifdef FORCE_SAPPHIRERAPIDS +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#ifdef NO_AVX512 +#ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif +#else +#define SUBARCHITECTURE "HASWELL" +#define ARCHCONFIG "-DHASWELL " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" +#define LIBNAME "haswell" +#define CORENAME "HASWELL" +#endif +#else +#define SUBARCHITECTURE "SAPPHIRERAPIDS" +#define ARCHCONFIG "-DSAPPHIRERAPIDS " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=sapphirerapids" +#define LIBNAME "sapphirerapids" +#define CORENAME "SAPPHIRERAPIDS" +#endif +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL @@ -563,6 +673,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 
SUCH DAMAGE. #define FORCE_INTEL #define ARCHITECTURE "X86" #ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -571,6 +691,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" +#endif #else #define SUBARCHITECTURE "ZEN" #define ARCHCONFIG "-DZEN " \ @@ -814,31 +935,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef FORCE_LOONGSON3A +#if defined FORCE_LOONGSON3R3 || defined FORCE_LOONGSON3A || defined FORCE_LOONGSON3B #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3A" +#define SUBARCHITECTURE "LOONGSON3R3" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3A " \ +#define ARCHCONFIG "-DLOONGSON3R3 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3a" -#define CORENAME "LOONGSON3A" +#define LIBNAME "loongson3r3" +#define CORENAME "LOONGSON3R3" #else #endif -#ifdef FORCE_LOONGSON3B +#ifdef FORCE_LOONGSON3R4 #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3B" +#define SUBARCHITECTURE "LOONGSON3R4" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3B " \ +#define ARCHCONFIG "-DLOONGSON3R4 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3b" -#define CORENAME "LOONGSON3B" +#define LIBNAME "loongson3r4" +#define CORENAME "LOONGSON3R4" +#else +#endif + +#ifdef FORCE_LOONGSON3R5 +#define FORCE +#define ARCHITECTURE "LOONGARCH" +#define SUBARCHITECTURE "LOONGSON3R5" +#define SUBDIRNAME "loongarch64" +#define ARCHCONFIG "-DLOONGSON3R5 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " +#define LIBNAME "loongson3r5" +#define CORENAME "LOONGSON3R5" #else #endif @@ -878,7 +1013,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DP5600 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" #define LIBNAME "p5600" #define CORENAME "P5600" #else @@ -892,7 +1027,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ARCHCONFIG "-DMIPS1004K " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" #define LIBNAME "mips1004K" #define CORENAME "MIPS1004K" #else @@ -906,7 +1041,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DMIPS24K " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=32768 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" #define LIBNAME "mips24K" #define CORENAME "MIPS24K" #else @@ -1063,6 +1198,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_ARMV8SVE +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "ARMV8SVE" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DARMV8SVE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" +#define LIBNAME "armv8sve" +#define CORENAME "ARMV8SVE" +#endif + #ifdef FORCE_ARMV8 #define FORCE @@ -1153,12 +1302,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \ - "-march=armv8.2-a -mtune=cortex-a72" + "-march=armv8.2-a -mtune=neoverse-n1" #define LIBNAME "neoversen1" #define CORENAME "NEOVERSEN1" #else #endif +#ifdef FORCE_NEOVERSEV1 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "NEOVERSEV1" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DNEOVERSEV1 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ + "-march=armv8.4-a -mtune=neoverse-v1" +#define LIBNAME "neoversev1" +#define CORENAME "NEOVERSEV1" +#else +#endif + + +#ifdef FORCE_NEOVERSEN2 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "NEOVERSEN2" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DNEOVERSEN2 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ + "-march=armv8.5-a -mtune=neoverse-n2" +#define LIBNAME "neoversen2" +#define CORENAME "NEOVERSEN2" +#else +#endif + +#ifdef FORCE_CORTEXA55 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXA55" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXA55 " \ + "-DL1_CODE_SIZE=16384 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=65536 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "cortexa55" +#define CORENAME 
"CORTEXA55" +#else +#endif #ifdef FORCE_FALKOR #define FORCE @@ -1274,6 +1473,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "VORTEX" #endif +#ifdef FORCE_A64FX +#define ARMV8 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "A64FX" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DA64FX " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=256 -DL1_CODE_ASSOCIATIVE=8 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=256 -DL1_DATA_ASSOCIATIVE=8 " \ + "-DL2_SIZE=8388608 -DL2_LINESIZE=256 -DL2_ASSOCIATIVE=8 " \ + "-DL3_SIZE=0 -DL3_LINESIZE=0 -DL3_ASSOCIATIVE=0 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" +#define LIBNAME "a64fx" +#define CORENAME "A64FX" +#else +#endif + #ifdef FORCE_ZARCH_GENERIC #define FORCE #define ARCHITECTURE "ZARCH" @@ -1319,6 +1536,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(FORCE_E2K) || defined(__e2k__) +#define FORCE +#define ARCHITECTURE "E2K" +#define ARCHCONFIG "-DGENERIC " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "generic" +#define CORENAME "generic" +#endif + #ifndef FORCE #ifdef USER_TARGET @@ -1373,8 +1601,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __loongarch64 +#include "cpuid_loongarch64.c" +#define OPENBLAS_SUPPORTED +#endif + #ifdef __riscv #include "cpuid_riscv64.c" +#define OPENBLAS_SUPPORTED #endif #ifdef __arm__ @@ -1447,7 +1681,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -1595,7 +1829,7 @@ printf("ELF_VERSION=2\n"); #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif diff --git a/getarch_2nd.c b/getarch_2nd.c index c390ef52c..dd1f83089 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -4,6 +4,14 @@ #else #include "config_kernel.h" #endif +#if (defined(__WIN32__) || defined(__WIN64__) || defined(__CYGWIN32__) || defined(__CYGWIN64__) || defined(_WIN32) || defined(_WIN64)) && defined(__64BIT__) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + #include "param.h" int main(int argc, char **argv) { diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 5346ecadd..0b2998237 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -28,14 +28,21 @@ set(BLAS1_MANGLED_SOURCES # these all have 'z' sources for complex versions set(BLAS2_SOURCES gemv.c ger.c - trsv.c trmv.c symv.c - syr.c syr2.c gbmv.c - sbmv.c spmv.c - spr.c spr2.c + trsv.c 
trmv.c + syr2.c gbmv.c + sbmv.c + spr2.c tbsv.c tbmv.c tpsv.c tpmv.c ) +set(BLAS2_REAL_ONLY_SOURCES + symv.c syr.c spmv.c spr.c +) +set(BLAS2_COMPLEX_LAPACK_SOURCES + symv.c syr.c spmv.c spr.c +) + set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES hemv.c hbmv.c her.c her2.c @@ -78,10 +85,15 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) GenerateNamedObjects("${BLAS1_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) GenerateNamedObjects("${BLAS1_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) GenerateNamedObjects("${BLAS2_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + GenerateNamedObjects("${BLAS2_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) + if (NOT DEFINED NO_LAPACK) + GenerateNamedObjects("${BLAS2_COMPLEX_LAPACK_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + endif () GenerateNamedObjects("${BLAS2_COMPLEX_ONLY_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 4) GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + GenerateNamedObjects("xerbla.c" "" "xerbla" ${CBLAS_FLAG} "" "" true) #sdsdot, dsdot if (BUILD_SINGLE OR BUILD_DOUBLE) GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE") @@ -104,6 +116,15 @@ endif () GenerateNamedObjects("imax.c" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG}) GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG}) +if (BUILD_BFLOAT16) + GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("bf16to.c" "SINGLE_PREC" "sbf16tos" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("bf16to.c" "DOUBLE_PREC" "dbf16tod" ${CBLAS_FLAG} "" "" true "BFLOAT16") +endif () # complex-specific sources foreach (float_type ${FLOAT_TYPES}) diff --git a/interface/Makefile b/interface/Makefile index 597956fdb..f57d0bda0 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -316,7 +316,7 @@ CCBLAS1OBJS = \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ cblas_caxpby.$(SUFFIX) \ - cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) + cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) CCBLAS2OBJS = \ cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ @@ -346,7 +346,7 @@ CZBLAS1OBJS = \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ cblas_zaxpby.$(SUFFIX) \ - cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) + cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) CZBLAS2OBJS = \ @@ -1016,11 +1016,13 @@ dsymv.$(SUFFIX) dsymv.$(PSUFFIX) : symv.c qsymv.$(SUFFIX) qsymv.$(PSUFFIX) : symv.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK csymv.$(SUFFIX) csymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) zsymv.$(SUFFIX) zsymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xsymv.$(SUFFIX) xsymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1034,11 +1036,13 @@ dsyr.$(SUFFIX) 
dsyr.$(PSUFFIX) : syr.c qsyr.$(SUFFIX) qsyr.$(PSUFFIX) : syr.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK csyr.$(SUFFIX) csyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o $(@F) zsyr.$(SUFFIX) zsyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xsyr.$(SUFFIX) xsyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1106,11 +1110,13 @@ dspmv.$(SUFFIX) dspmv.$(PSUFFIX) : spmv.c qspmv.$(SUFFIX) qspmv.$(PSUFFIX) : spmv.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK cspmv.$(SUFFIX) cspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) zspmv.$(SUFFIX) zspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xspmv.$(SUFFIX) xspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1124,11 +1130,13 @@ dspr.$(SUFFIX) dspr.$(PSUFFIX) : spr.c qspr.$(SUFFIX) qspr.$(PSUFFIX) : spr.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK cspr.$(SUFFIX) cspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) zspr.$(SUFFIX) zspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xspr.$(SUFFIX) xspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1634,6 +1642,12 @@ cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +cblas_crotg.$(SUFFIX) cblas_crotg.$(PSUFFIX): zrotg.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_zrotg.$(SUFFIX) cblas_zrotg.$(PSUFFIX): zrotg.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1664,6 +1678,12 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) +cblas_csrot.$(SUFFIX) cblas_csrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_zdrot.$(SUFFIX) cblas_zdrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + ifeq ($(BUILD_BFLOAT16),1) cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) diff --git a/interface/axpy.c b/interface/axpy.c index eaa19f4df..5304ebec3 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -115,7 +115,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc #endif blas_level1_thread(mode, n, 0, 0, &alpha, - x, incx, y, incy, NULL, 0, (void *)AXPYU_K, nthreads); + x, incx, y, incy, NULL, 0, (int (*)(void))AXPYU_K, nthreads); } #endif diff --git a/interface/create b/interface/create index b7be8ab6e..0b9cefa2b 100755 --- a/interface/create +++ b/interface/create @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl $count = 0; diff --git a/interface/gemm.c b/interface/gemm.c index 860e588fe..71cc77a1b 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -49,6 +49,8 @@ #define ERROR_NAME "QGEMM " #elif defined(DOUBLE) #define ERROR_NAME "DGEMM " +#elif defined(BFLOAT16) +#define ERROR_NAME "SBGEMM " #else #define ERROR_NAME "SGEMM " #endif @@ -103,6 +105,55 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B #endif }; +#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) +#define USE_SMALL_MATRIX_OPT 1 +#else +#define USE_SMALL_MATRIX_OPT 0 +#endif + +#if USE_SMALL_MATRIX_OPT +#ifndef DYNAMIC_ARCH +#define SMALL_KERNEL_ADDR(table, idx) ((void *)(table[idx])) +#else +#define SMALL_KERNEL_ADDR(table, idx) ((void *)(*(uintptr_t *)((char *)gotoblas + (size_t)(table[idx])))) +#endif + + +#ifndef COMPLEX +static size_t gemm_small_kernel[] = { + GEMM_SMALL_KERNEL_NN, 
GEMM_SMALL_KERNEL_TN, 0, 0, + GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, 0, 0, +}; + + +static size_t gemm_small_kernel_b0[] = { + GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, 0, 0, + GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, 0, 0, +}; + +#define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx)) +#define GEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel, (idx)) +#else + +static size_t zgemm_small_kernel[] = { + GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN, + GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT, + GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR, + GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC, +}; + +static size_t zgemm_small_kernel_b0[] = { + GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN, + GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT, + GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR, + GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC, +}; + +#define ZGEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel, (idx)) +#define ZGEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel_b0, (idx)) +#endif +#endif + #ifndef CBLAS void NAME(char *TRANSA, char *TRANSB, @@ -124,6 +175,7 @@ void NAME(char *TRANSA, char *TRANSB, #ifdef SMP double MNK; +#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; @@ -142,6 +194,7 @@ void NAME(char *TRANSA, char *TRANSB, #endif #endif #endif +#endif #if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) int nodes; @@ -220,8 +273,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS blasint m, blasint n, blasint k, #ifndef COMPLEX FLOAT alpha, - FLOAT *a, blasint lda, - FLOAT *b, blasint ldb, + IFLOAT *a, blasint lda, + IFLOAT *b, blasint ldb, FLOAT beta, FLOAT *c, blasint ldc) { #else @@ -246,6 +299,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #ifdef SMP double MNK; +#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; @@ -264,6 +318,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif #endif #endif +#endif #if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) int nodes; @@ -271,7 +326,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS PRINT_DEBUG_CNAME; -#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT) +#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) #ifdef DYNAMIC_ARCH if (support_avx512() ) #endif @@ -411,14 +466,38 @@ void CNAME(enum 
CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FUNCTION_PROFILE_START(); +#if USE_SMALL_MATRIX_OPT +#if !defined(COMPLEX) + if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))){ + if(*(FLOAT *)(args.beta) == 0.0){ + (GEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); + }else{ + (GEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); + } + return; + } +#else + if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, alpha[0], alpha[1], beta[0], beta[1])){ + if(beta[0] == 0.0 && beta[1] == 0.0){ + (ZGEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc); + }else{ + (ZGEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc); + } + return; + } +#endif +#endif + buffer = (XFLOAT *)blas_memory_alloc(0); sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #ifdef SMP +#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) mode |= (transa << BLAS_TRANSA_SHIFT); mode |= (transb << BLAS_TRANSB_SHIFT); +#endif MNK = (double) args.m * (double) args.n * (double) args.k; if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) diff --git a/interface/gemv.c b/interface/gemv.c index d5d739fb1..1f0763579 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -201,7 +201,14 @@ void CNAME(enum CBLAS_ORDER order, if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; - + +#if 0 +/* this optimization causes stack corruption on x86_64 under OSX, Windows and FreeBSD */ + if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { + GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL); + return; + } +#endif IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/ger.c b/interface/ger.c index 8cf1614e3..af6ae8606 100644 --- a/interface/ger.c +++ b/interface/ger.c @@ -164,6 +164,11 @@ void CNAME(enum CBLAS_ORDER order, if (m == 0 || n == 0) return; if (alpha == 0.) 
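The interface/ger.c hunk here adds a short-circuit that sends small unit-stride rank-1 updates straight to the unthreaded GER kernel, skipping buffer allocation and the threading setup. A minimal sketch of that size gate, assuming a 32-bit blasint and the usual GEMM_MULTITHREAD_THRESHOLD default of 4 (both assumptions, set per build); the 1L factor promotes the product so m*n is compared as a long, which on LP64 platforms avoids 32-bit overflow before the threshold check.

#include <stdio.h>

typedef int blasint;                    /* assumed 32-bit blasint */
#define GEMM_MULTITHREAD_THRESHOLD 4    /* assumed build-time default */

/* Mirrors the gate added below: unit strides and m*n within the threshold. */
static int ger_small_case(blasint m, blasint n, blasint incx, blasint incy) {
  return incx == 1 && incy == 1 &&
         1L * m * n <= 2048 * GEMM_MULTITHREAD_THRESHOLD;
}

int main(void) {
  printf("%d\n", ger_small_case(64, 64, 1, 1));    /* 4096 <= 8192 -> 1 */
  printf("%d\n", ger_small_case(200, 200, 1, 1));  /* 40000 > 8192 -> 0 */
  return 0;
}

With the assumed default threshold this admits updates of up to 8192 elements; larger calls fall through to the existing buffered, possibly threaded path.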
return; + if (incx == 1 && incy == 1 && 1L*m*n <= 2048 *GEMM_MULTITHREAD_THRESHOLD) { + GER(m, n, 0, alpha, x, incx, y, incy, a, lda, NULL); + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 93ffd69f9..91975f7f4 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -150,9 +150,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, #endif if ( *lda > *ldb ) - msize = (*lda) * (*ldb) * sizeof(FLOAT); + msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT); else - msize = (*ldb) * (*ldb) * sizeof(FLOAT); + msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT); b = malloc(msize); if ( b == NULL ) diff --git a/interface/lapack/getrf.c b/interface/lapack/getrf.c index 02bb124b3..323370ebc 100644 --- a/interface/lapack/getrf.c +++ b/interface/lapack/getrf.c @@ -95,7 +95,14 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint #ifdef SMP args.common = NULL; - args.nthreads = num_cpu_avail(4); +#ifndef DOUBLE + if (args.m*args.n < 40000) +#else + if (args.m*args.n < 10000) +#endif + args.nthreads=1; + else + args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif diff --git a/interface/lapack/potrf.c b/interface/lapack/potrf.c index dbd55f62f..3abc80133 100644 --- a/interface/lapack/potrf.c +++ b/interface/lapack/potrf.c @@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP args.common = NULL; +#ifndef DOUBLE + if (args.n <128) +#else + if (args.n <64) +#endif + args.nthreads = 1; + else args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { diff --git a/interface/lapack/potri.c b/interface/lapack/potri.c index 2c0c64b6f..eb0fcbe70 100644 --- a/interface/lapack/potri.c +++ b/interface/lapack/potri.c @@ -121,6 +121,9 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP args.common = NULL; + if (args.n < 180) + args.nthreads = 1; + else args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { diff --git a/interface/lapack/zgetrf.c b/interface/lapack/zgetrf.c index 7f8db94f6..d03541fad 100644 --- a/interface/lapack/zgetrf.c +++ b/interface/lapack/zgetrf.c @@ -95,7 +95,10 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint #ifdef SMP args.common = NULL; - args.nthreads = num_cpu_avail(4); + if (args.m*args.n <10000) + args.nthreads = 1; + else + args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif diff --git a/interface/lapack/zpotrf.c b/interface/lapack/zpotrf.c index c4cd99bf6..298efbbc1 100644 --- a/interface/lapack/zpotrf.c +++ b/interface/lapack/zpotrf.c @@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP args.common = NULL; +#ifndef DOUBLE + if (args.n < 64) +#else + if (args.n < 64) +#endif + args.nthreads = 1; + else args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { diff --git a/interface/lapack/zpotri.c b/interface/lapack/zpotri.c index 8da211683..8748c6352 100644 --- a/interface/lapack/zpotri.c +++ b/interface/lapack/zpotri.c @@ -121,6 +121,15 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP args.nthreads = num_cpu_avail(4); +#ifndef DOUBLE + if (args.n < 200) +#else + if (args.n < 150) +#endif + args.nthreads=1; + else +#endif + args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif diff --git a/interface/rotmg.c b/interface/rotmg.c index ce3b146c1..3a5ca8f95 100644 --- a/interface/rotmg.c +++ 
b/interface/rotmg.c @@ -107,7 +107,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ dq1 = dp1 * *dx1; if(ABS(dq1) > ABS(dq2)) { - dflag = ZERO; dh11 = ONE; dh22 = ONE; dh21 = - dy1 / *dx1; diff --git a/interface/scal.c b/interface/scal.c index 6d07b1650..0a7fee640 100644 --- a/interface/scal.c +++ b/interface/scal.c @@ -102,7 +102,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ #else &alpha, #endif - x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); + x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); } #endif diff --git a/interface/spr.c b/interface/spr.c index 1956986e9..8aafc9f85 100644 --- a/interface/spr.c +++ b/interface/spr.c @@ -167,6 +167,26 @@ void CNAME(enum CBLAS_ORDER order, FUNCTION_PROFILE_START(); + if (incx == 1 && n <100) { + blasint i; + if (uplo==0) { + for (i = 0; i < n; i++){ + if (x[i] != ZERO) { + AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0); + } + a += i + 1; + } + } else { + for (i = 0; i < n; i++){ + if (x[i] != ZERO) { + AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0); + } + a += n - i; + } + } + return; + } + if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); diff --git a/interface/spr2.c b/interface/spr2.c index 73a811c3e..b5aab1767 100644 --- a/interface/spr2.c +++ b/interface/spr2.c @@ -168,6 +168,24 @@ void CNAME(enum CBLAS_ORDER order, if (alpha == ZERO) return; + if (incx == 1 && incy == 1 && n < 50) { + blasint i; + if (!uplo) { + for (i = 0; i < n; i++){ + AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0); + a += i + 1; + } + } else { + for (i = 0; i < n; i++){ + AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0); + AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0); + a += n - i; + } + } + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/syr.c b/interface/syr.c index 1374bcc69..ad75264b1 100644 --- a/interface/syr.c +++ b/interface/syr.c @@ -168,7 +168,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, IDEBUG_START; FUNCTION_PROFILE_START(); - +#if 1 + if (incx == 1 && n < 100) { + BLASLONG i; + + if (uplo == 0) { + for (i = 0; i < n; i++){ + if (x[i] != ZERO) { + AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0); + } + a += lda; + } + } else { + for (i = 0; i < n; i++){ + if (x[i] != ZERO) { + AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0); + } + a += 1 + lda; + } + } + return; + } +#endif if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); diff --git a/interface/syr2.c b/interface/syr2.c index 08fd47e57..632906d28 100644 --- a/interface/syr2.c +++ b/interface/syr2.c @@ -170,6 +170,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, IDEBUG_START; + if (incx == 1 && incy == 1 && n < 100) { + blasint i; + if (!uplo) { + for (i = 0; i < n; i++){ + AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0); + a += lda; + } + } else { + for (i = 0; i < n; i++){ + AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0); + AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0); + a += 1 + lda; + } + } + return; + } + + FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; diff --git a/interface/syrk.c b/interface/syrk.c index 7699db683..edb113d6c 100644 --- a/interface/syrk.c +++ b/interface/syrk.c @@ -354,6 +354,17 @@ void CNAME(enum CBLAS_ORDER order, enum 
CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr #endif args.common = NULL; +#ifndef COMPLEX +#ifdef DOUBLE + if (args.n < 100) +#else + if (args.n < 200) +#endif +#else + if (args.n < 65) +#endif + args.nthreads = 1; + else args.nthreads = num_cpu_avail(3); if (args.nthreads == 1) { diff --git a/interface/zaxpy.c b/interface/zaxpy.c index da3b48ead..0e168606d 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -128,9 +128,9 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in blas_level1_thread(mode, n, 0, 0, ALPHA, x, incx, y, incy, NULL, 0, #ifndef CONJ - (void *)AXPYU_K, + (int (*)(void))AXPYU_K, #else - (void *)AXPYC_K, + (int (*)(void))AXPYC_K, #endif nthreads); } diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index 87964e20d..ecda5ef4e 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -172,9 +172,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, #endif if ( *lda > *ldb ) - msize = (*lda) * (*ldb) * sizeof(FLOAT) * 2; + msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2; else - msize = (*ldb) * (*ldb) * sizeof(FLOAT) * 2; + msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2; b = malloc(msize); if ( b == NULL ) diff --git a/interface/zrot.c b/interface/zrot.c index 1c45f685b..228c5ee45 100644 --- a/interface/zrot.c +++ b/interface/zrot.c @@ -42,14 +42,20 @@ #include "functable.h" #endif +#ifndef CBLAS void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){ - BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; FLOAT c = *C; FLOAT s = *S; +#else +void CNAME(blasint n, void *VX, blasint incx, void *VY, blasint incy, FLOAT c, FLOAT s) { + FLOAT *x = (FLOAT*) VX; + FLOAT *y = (FLOAT*) VY; +#endif /* CBLAS */ + PRINT_DEBUG_NAME; if (n <= 0) return; diff --git a/interface/zrotg.c b/interface/zrotg.c index 8caa411fc..123f4da85 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -4,8 +4,16 @@ #include "functable.h" #endif +#ifndef CBLAS void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ +#else +void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { + FLOAT *DA = (FLOAT*) VDA; + FLOAT *DB = (FLOAT*) VDB; + FLOAT *S = (FLOAT*) VS; +#endif /* CBLAS */ + #if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) long double da_r = *(DA + 0); @@ -79,8 +87,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ aa_i = fabs(da_r); } - scale = (aa_i / aa_r); - ada = aa_r * sqrt(ONE + scale * scale); + if (aa_r == ZERO) { + ada = 0.; + } else { + scale = (aa_i / aa_r); + ada = aa_r * sqrt(ONE + scale * scale); + } bb_r = fabs(db_r); bb_i = fabs(db_i); @@ -90,9 +102,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ bb_i = fabs(bb_r); } - scale = (bb_i / bb_r); - adb = bb_r * sqrt(ONE + scale * scale); - + if (bb_r == ZERO) { + adb = 0.; + } else { + scale = (bb_i / bb_r); + adb = bb_r * sqrt(ONE + scale * scale); + } scale = ada + adb; aa_r = da_r / scale; diff --git a/interface/zscal.c b/interface/zscal.c index bfaddc260..498377343 100644 --- a/interface/zscal.c +++ b/interface/zscal.c @@ -108,7 +108,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ mode = BLAS_SINGLE | BLAS_COMPLEX; #endif - blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); + blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); } #endif diff --git a/interface/zsyr.c b/interface/zsyr.c index 09b1de578..54fb8a4e9 100644 
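The interface/zrotg.c changes above guard the scaled-modulus computation against a zero operand, so 0/0 no longer produces a NaN rotation. A minimal standalone sketch of that computation, using a hypothetical scaled_abs helper (the patch itself works in place on the real and imaginary parts and keeps the same ordering of the two magnitudes).

#include <math.h>
#include <stdio.h>

/* |re + i*im| computed as big * sqrt(1 + (small/big)^2) to avoid overflow,
 * with the zero guard added by the patch when both parts are zero. */
static double scaled_abs(double re, double im) {
  double big = fabs(re), small = fabs(im), scale;
  if (big < small) { double t = big; big = small; small = t; }
  if (big == 0.0) return 0.0;   /* was 0/0 before the guard */
  scale = small / big;
  return big * sqrt(1.0 + scale * scale);
}

int main(void) {
  printf("%g\n", scaled_abs(3.0, 4.0));  /* 5 */
  printf("%g\n", scaled_abs(0.0, 0.0));  /* 0, no NaN */
  return 0;
}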
--- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -119,7 +119,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { FLOAT *buffer; - int trans, uplo; + int uplo; blasint info; FLOAT * ALPHA = α FLOAT alpha_r = ALPHA[0]; @@ -130,7 +130,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO PRINT_DEBUG_CNAME; - trans = -1; uplo = -1; info = 0; @@ -172,6 +171,32 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + if (incx == 1 && n < 50) { + blasint i; + if (!uplo) { + for (i = 0; i < n; i++){ + if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) { + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1], + alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1], + x, 1, a, 1, NULL, 0); + } + a += lda; + } + } else { + for (i = 0; i < n; i++){ + if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) { + AXPYU_K(n - i, 0, 0, + alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1], + alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1], + x + i * 2, 1, a, 1, NULL, 0); + } + a += 2 + lda; + } + } + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 6d8d759ad..98c803e71 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -9,11 +9,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) if (${DYNAMIC_ARCH}) include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") endif () + ParseMakefileVars("${KERNELDIR}/KERNEL") + ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") SetDefaultL1() SetDefaultL2() SetDefaultL3() - ParseMakefileVars("${KERNELDIR}/KERNEL") - ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") set(KERNEL_INTERFACE common_level1.h common_level2.h common_level3.h) if(NOT NO_LAPACK) @@ -91,6 +91,15 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE") + # sbdot + if (BUILD_BFLOAT16) + GenerateNamedObjects("${KERNELDIR}/${SBDOTKERNEL}" "SBDOT" "dot_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "SINGLE" "f16tos_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "DOUBLE" "bf16tod_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "SINGLE" "stobf16_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "DOUBLE" "dtobf16_k" false "" "" false "BFLOAT16") + endif() + if ((BUILD_COMPLEX OR BUILD_DOUBLE) AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE") @@ -149,9 +158,6 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - set (float_char "SB") - endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) 
GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type}) @@ -185,12 +191,17 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") endif () + if (BUILD_BFLOAT16) + GenerateNamedObjects("${KERNELDIR}/${SBGEMVNKERNEL}" "" "gemv_n" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMVTKERNEL}" "" "gemv_t" false "" "" false "BFLOAT16") + endif () # Makefile.L3 set(USE_TRMM false) - if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE)) + string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) + if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS)) set(USE_TRMM true) endif () - if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10)) + if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) set(USE_TRMM true) endif () @@ -208,15 +219,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) endif() - foreach (float_type SINGLE DOUBLE BFLOAT16) + foreach (float_type SINGLE DOUBLE) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - if (NOT ${BUILD_BFLOAT16}) - continue () - else () - set (float_char "SB") - endif () - endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) endforeach() if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) @@ -252,11 +256,24 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE") endif () + if (BUILD_BFLOAT16) + if (SBGEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMINCOPY}" "" "${SBGEMMINCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMITCOPY}" "" "${SBGEMMITCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMONCOPY}" "" "${SBGEMMONCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMOTCOPY}" "" "${SBGEMMOTCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16") + endif () foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - set (float_char "SB") - endif () if (${float_char}GEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) endif () @@ -306,55 +323,93 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) #hemm - 
GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "hemm_iutcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) +if (NOT DEFINED ${float_char}HEMMUTCOPY_M) + set(HEMMUTCOPY_M "generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(HEMMLTCOPY_M "generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(HEMMUTCOPY_M "${KERNELDIR}/${${float_char}HEMMUTCOPY_M}") + set(HEMMLTCOPY_M "${KERNELDIR}/${${float_char}HEMMLTCOPY_M}") +endif() + GenerateNamedObjects(${HEMMUTCOPY_M} "" "hemm_iutcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${HEMMLTCOPY_M} "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "hemm_outcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "hemm_oltcopy" false "" "" false ${float_type}) # symm for c and z +if (NOT DEFINED ${float_char}SYMMUCOPY_M) + set(SYMMUCOPY_M "generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c") + set(SYMMLCOPY_M "generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}") + set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}") +endif() GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type}) + - GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) +if (NOT DEFINED ${float_char}TRMMUNCOPY_M) + set(TRMMUNCOPY_M "generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLNCOPY_M "generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMUTCOPY_M "generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLTCOPY_M "generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}") + set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}") + set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}") + set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}") +endif () + GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" 
"trmm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) + +if (NOT DEFINED ZTRSMCOPYLN_M) + set(ZTRSMUNCOPY_M "generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMLNCOPY_M "generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMUTCOPY_M "generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMLTCOPY_M "generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(ZTRSMUNCOPY_M "${KERNELDIR}/${ZTRSMCOPYUN_M}") + set(ZTRSMLNCOPY_M "${KERNELDIR}/${ZTRSMCOPYLN_M}") + set(ZTRSMUTCOPY_M "${KERNELDIR}/${ZTRSMCOPYUT_M}") + set(ZTRSMLTCOPY_M "${KERNELDIR}/${ZTRSMCOPYLT_M}") +endif () + GenerateNamedObjects(${ZTRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" 
false ${float_type}) - GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) @@ -401,52 +456,82 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type}) # symm for s and d +if (NOT DEFINED ${float_char}SYMMUCOPY_M) + set(SYMMUCOPY_M "generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c") + set(SYMMLCOPY_M "generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}") + set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}") +endif() GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type}) # These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. 
Also TRANS is not passed in as a define. # Could simplify it a bit by pairing up by -UUNIT/-DUNIT. - GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) +if (NOT DEFINED ${float_char}TRMMUNCOPY_M) + set(TRMMUNCOPY_M "generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLNCOPY_M "generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMUTCOPY_M "generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLTCOPY_M "generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}") + set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}") + set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}") + set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}") +endif () + GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) 
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) + +if (NOT DEFINED TRSMCOPYLN_M) + set(TRSMUNCOPY_M "generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMLNCOPY_M "generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMUTCOPY_M "generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMLTCOPY_M "generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(TRSMUNCOPY_M "${KERNELDIR}/${TRSMCOPYUN_M}") + set(TRSMLNCOPY_M "${KERNELDIR}/${TRSMCOPYLN_M}") + set(TRSMUTCOPY_M "${KERNELDIR}/${TRSMCOPYUT_M}") + set(TRSMLTCOPY_M "${KERNELDIR}/${TRSMCOPYLT_M}") +endif () + GenerateNamedObjects(${TRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) 
GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) @@ -457,7 +542,155 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false ${float_type}) + if (NOT DEFINED ${float_char}GEMM_SMALL_M_PERMIT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_M_PERMIT ../generic/zgemm_small_matrix_permit.c) + else () + set(${float_char}GEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_NN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_NN ../generic/zgemm_small_matrix_kernel_nn.c) + else () + set(${float_char}GEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_NT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_NT ../generic/zgemm_small_matrix_kernel_nt.c) + else () + set(${float_char}GEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_TN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_TN ../generic/zgemm_small_matrix_kernel_tn.c) + else () + set(${float_char}GEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_TT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_TT ../generic/zgemm_small_matrix_kernel_tt.c) + else () + set(${float_char}GEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_NN ../generic/zgemm_small_matrix_kernel_nn.c) + else () + set(${float_char}GEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_NT ../generic/zgemm_small_matrix_kernel_nt.c) + else () + set(${float_char}GEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_TN ../generic/zgemm_small_matrix_kernel_tn.c) + else () + set(${float_char}GEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_TT ../generic/zgemm_small_matrix_kernel_tt.c) + else () + set(${float_char}GEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + endif () + + if (SMALL_MATRIX_OPT) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false ${float_type}) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "NN" "gemm_small_kernel_nn" 
false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "NR" "gemm_small_kernel_nr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "RN" "gemm_small_kernel_rn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "RR" "gemm_small_kernel_rr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "NT" "gemm_small_kernel_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "NC" "gemm_small_kernel_nc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "RT" "gemm_small_kernel_rt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "RC" "gemm_small_kernel_rc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "TN" "gemm_small_kernel_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "TR" "gemm_small_kernel_tr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "CN" "gemm_small_kernel_cn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "CR" "gemm_small_kernel_cr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TT" "gemm_small_kernel_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TC" "gemm_small_kernel_tc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CT" "gemm_small_kernel_ct" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CC" "gemm_small_kernel_cc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NN;B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NR;B0" "gemm_small_kernel_b0_nr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RN;B0" "gemm_small_kernel_b0_rn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RR;B0" "gemm_small_kernel_b0_rr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NT;B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NC;B0" "gemm_small_kernel_b0_nc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RT;B0" "gemm_small_kernel_b0_rt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RC;B0" "gemm_small_kernel_b0_rc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TN;B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TR;B0" "gemm_small_kernel_b0_tr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CN;B0" "gemm_small_kernel_b0_cn" false "" "" false ${float_type}) + 
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CR;B0" "gemm_small_kernel_b0_cr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TT;B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TC;B0" "gemm_small_kernel_b0_tc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CT;B0" "gemm_small_kernel_b0_ct" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CC;B0" "gemm_small_kernel_b0_cc" false "" "" false ${float_type}) + else () + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) + endif () + if (BUILD_BFLOAT16) + if (NOT DEFINED SBGEMM_SMALL_M_PERMIT) + set(SBGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_NN) + set(SBGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_NT) + set(SBGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_TN) + set(SBGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_TT) + set(SBGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_NN) + set(SBGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_NT) + set(SBGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_TN) + set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_TT) + set(SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false 
"BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") + endif () + endif () if (NOT DEFINED ${float_char}OMATCOPY_CN) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") @@ -591,6 +824,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) #geadd GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) endforeach () + if (BUILD_DOUBLE AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE") @@ -729,22 +963,22 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" ${TSUFFIX} false "SINGLE") if (SGEMMINCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") endif () - if (SGEMMITCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") - endif () - if (SGEMMONCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") - endif () - if (SGEMMOTCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") + if (SGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") endif () GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") endif () - - if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) + + if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "DOUBLE") GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "DOUBLE") endif () diff --git a/kernel/Makefile b/kernel/Makefile index fb1d5d39a..cbe4cde6e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -31,12 +31,27 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE -ifeq ($(TARGET_CORE), COOPERLAKE) +ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) + ifeq ($(GCCVERSIONGTEQ10), 1) + override CFLAGS += -march=sapphirerapids + else + override CFLAGS += -march=skylake-avx512 -mavx512f + endif + ifeq ($(OSNAME), CYGWIN_NT) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + ifeq ($(OSNAME), WINNT) + ifeq ($(C_COMPILER), GCC) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + endif +else ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS 
+= -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) ifeq ($(GCCVERSIONGTEQ10), 1) override CFLAGS += -march=cooperlake else - override CFLAGS += -march=skylake-avx512 + override CFLAGS += -march=skylake-avx512 -mavx512f endif ifeq ($(OSNAME), CYGWIN_NT) override CFLAGS += -fno-asynchronous-unwind-tables @@ -47,7 +62,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE) endif endif else ifeq ($(TARGET_CORE), SKYLAKEX) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 -mavx512f ifeq ($(OSNAME), CYGWIN_NT) override CFLAGS += -fno-asynchronous-unwind-tables endif @@ -58,6 +73,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) endif else ifeq ($(TARGET_CORE), HASWELL) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) +else ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif @@ -68,6 +85,9 @@ else TARGET_CORE = $(CORE) KDIR = TSUFFIX = +ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += $(MSA_FLAGS) +endif endif -include $(KERNELDIR)/KERNEL.$(TARGET_CORE) diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 7ad94118a..09337363d 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -1,3 +1,11 @@ +FMAFLAG= +ifndef OLDGCC +ifdef HAVE_FMA3 +FMAFLAG = -mfma +endif +endif + + ### AMAX ### ifndef SAMAXKERNEL @@ -828,10 +836,10 @@ $(KDIR)xnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)xnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KE $(CC) $(CFLAGS) -DCOMPLEX -c -DXDOUBLE $< -o $@ $(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ $(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2 index 888a9b959..ac53c29c3 100644 --- a/kernel/Makefile.L2 +++ b/kernel/Makefile.L2 @@ -1,3 +1,10 @@ +FMAFLAG= +ifndef OLDGCC +ifdef HAVE_FMA3 +FMAFLAG = -mfma +endif +endif + ### GEMV ### ifndef SGEMVNKERNEL @@ -263,7 +270,7 @@ $(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KER $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@ $(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) - $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ + $(CC) -c $(CFLAGS) $(FMAFLAG) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ endif $(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 893713769..bea6cb048 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -29,10 +29,6 @@ ifeq ($(ARCH), riscv64) USE_TRMM = 1 endif -ifeq ($(TARGET), LOONGSON3B) -USE_TRMM = 1 -endif - ifneq ($(DYNAMIC_ARCH), 1) ifeq ($(TARGET), GENERIC) USE_TRMM = 1 @@ -51,6 +47,10 @@ ifeq ($(CORE), COOPERLAKE) USE_TRMM = 1 endif +ifeq ($(CORE), SAPPHIRERAPIDS) +USE_TRMM = 1 +endif + ifeq ($(CORE), ZEN) USE_TRMM = 1 endif 
@@ -451,6 +451,72 @@ XBLASOBJS += \ endif +###### BLAS small matrix optimization ##### +ifeq ($(SMALL_MATRIX_OPT), 1) + +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += \ + sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) +endif + +SBLASOBJS += \ + sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) + +DBLASOBJS += \ + dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) + +CBLASOBJS += \ + cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) + +ZBLASOBJS += \ + zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) 
zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) + +endif + ###### BLAS extensions ##### ifeq ($(BUILD_SINGLE),1) @@ -551,6 +617,10 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ +ifeq ($(ARCH), E2K) +USE_TRMM = 1 +endif + ifeq ($(BUILD_BFLOAT16), 1) @@ -822,6 +892,8 @@ ifeq ($(OS), AIX) m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s +else ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ else $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ endif @@ -832,6 +904,8 @@ ifeq ($(OS), AIX) m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s +else ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ else $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ endif @@ -842,6 +916,8 @@ ifeq ($(OS), AIX) m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s +else ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ else $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ endif @@ -852,6 +928,8 @@ ifeq ($(OS), AIX) m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s +else ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ else $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ endif @@ -1048,6 +1126,8 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ endif @@ -1058,6 +1138,8 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ rm ztrmm_kernel_lt.s 
ztrmm_kernel_lt_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ endif @@ -1068,6 +1150,8 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ endif @@ -1078,6 +1162,8 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ endif @@ -1088,6 +1174,8 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ endif @@ -1098,6 +1186,8 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ endif @@ -1108,6 +1198,8 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ endif @@ -1118,6 +1210,8 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ endif @@ -1191,29 +1285,55 @@ $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c 
-DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - +endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - +endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - +endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - +endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - +endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - +endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ endif +endif @@ -1367,29 +1487,61 @@ $(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XT $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ +ifdef STRMMUNCOPY_M +$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef STRMMLNCOPY_M +$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ +$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(STRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef STRMMUTCOPY_M +$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef STRMMLTCOPY_M +$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ +$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -1415,29 +1567,61 @@ $(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N $(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef DTRMMUNCOPY_M +$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef DTRMMLNCOPY_M +$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else 
$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +ifdef DTRMMUTCOPY_M +$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ +$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef DTRMMLTCOPY_M +$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -1511,29 +1695,61 @@ $(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N $(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef CTRMMUNCOPY_M +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif -$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c +ifdef CTRMMLNCOPY_M +$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER 
-DLOWER -UUNIT $< -o $@ +else +$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ +$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef CTRMMUTCOPY_M +$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif -$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c +ifdef CTRMMLTCOPY_M +$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif $(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -1559,29 +1775,61 @@ $(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_ $(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRMMUNCOPY_M +$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef ZTRMMLNCOPY_M +$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ +$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) 
$(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef ZTRMMUTCOPY_M +$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRMMLTCOPY_M +$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -1661,11 +1909,21 @@ $(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N). $(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef SSYMMUCOPY_M +$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef SSYMMLCOPY_M +$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ @@ -1673,11 +1931,21 @@ $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N). 
$(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef DSYMMUCOPY_M +$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef DSYMMLCOPY_M +$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ @@ -1697,11 +1965,21 @@ $(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N) $(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef CSYMMUCOPY_M +$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef CSYMMLCOPY_M +$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ @@ -1709,11 +1987,21 @@ $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N) $(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef ZSYMMUCOPY_M +$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef ZSYMMLCOPY_M +$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ @@ -1733,11 +2021,21 @@ $(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N 
$(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ +ifdef CHEMMUTCOPY_M +$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +else $(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +endif +ifdef CHEMMLTCOPY_M +$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +else $(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +endif $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ @@ -1745,11 +2043,21 @@ $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N $(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ +ifdef ZHEMMUTCOPY_M +$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +else $(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +endif +ifdef ZHEMMLTCOPY_M +$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +else $(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +endif $(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ @@ -2087,29 +2395,61 @@ $(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNR $(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ +ifdef TRSMCOPYUN_M +$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYLN_M +$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +ifdef TRSMCOPYUT_M +$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ +$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef TRSMCOPYLT_M +$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ +$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2135,29 +2475,61 @@ $(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N $(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef TRSMCOPYUN_M +$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef TRSMCOPYLN_M +$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ +$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else 
$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYUT_M +$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef TRSMCOPYLT_M +$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ +$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2231,29 +2603,61 @@ $(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N $(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRSMCOPYUN_M +$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLN_M +$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE 
-DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +ifdef ZTRSMCOPYUT_M +$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ +$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef ZTRSMCOPYLT_M +$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ +$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2279,29 +2683,61 @@ $(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_ $(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRSMCOPYUN_M +$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef ZTRSMCOPYLN_M +$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ +$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c 
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYUT_M +$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLT_M +$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -4191,3 +4627,469 @@ endif $(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@ + + +###### BLAS small matrix optimization ##### + +ifndef DGEMM_SMALL_M_PERMIT +DGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif + +ifndef DGEMM_SMALL_K_NN +DGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef DGEMM_SMALL_K_NT +DGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef DGEMM_SMALL_K_TN +DGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef DGEMM_SMALL_K_TT +DGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifndef DGEMM_SMALL_K_B0_NN +DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef DGEMM_SMALL_K_B0_NT +DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef DGEMM_SMALL_K_B0_TN +DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef DGEMM_SMALL_K_B0_TT +DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + 
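# Note (editorial sketch, not part of this patch): the small-matrix rules below use the
# same fallback idiom as the rest of this Makefile: a kernel-source variable gets a
# generic default only when the per-CPU KERNEL.<TARGET> file has not already set it, and
# the build rule then compiles whichever source the variable finally names.  A minimal
# sketch of the idiom with a hypothetical FOO_KERNEL variable:
#
#   ifndef FOO_KERNEL
#   FOO_KERNEL = ../generic/foo.c                      # portable C fallback
#   endif
#   $(KDIR)foo$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(FOO_KERNEL)
#   	$(CC) $(CFLAGS) -c $< -o $@                    # same recipe for either source
#
# so a target only defines FOO_KERNEL when it ships an architecture-specific kernel.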
+$(KDIR)dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ + +ifndef SGEMM_SMALL_M_PERMIT +SGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif + +ifndef SGEMM_SMALL_K_NN +SGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SGEMM_SMALL_K_NT +SGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SGEMM_SMALL_K_TN +SGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SGEMM_SMALL_K_TT +SGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifndef SGEMM_SMALL_K_B0_NN +SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SGEMM_SMALL_K_B0_NT +SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SGEMM_SMALL_K_B0_TN +SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SGEMM_SMALL_K_B0_TT +SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + + +ifeq ($(BUILD_BFLOAT16), 1) +ifndef SBGEMM_SMALL_M_PERMIT +SBGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif + +ifndef SBGEMM_SMALL_K_NN +SBGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SBGEMM_SMALL_K_NT +SBGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SBGEMM_SMALL_K_TN +SBGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SBGEMM_SMALL_K_TT +SBGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE 
-UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +ifndef SBGEMM_SMALL_K_B0_NN +SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SBGEMM_SMALL_K_B0_NT +SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SBGEMM_SMALL_K_B0_TN +SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SBGEMM_SMALL_K_B0_TT +SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ +endif + +ifndef CGEMM_SMALL_M_PERMIT +CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c +endif + +ifndef CGEMM_SMALL_K_NN +CGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef CGEMM_SMALL_K_NT +CGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef CGEMM_SMALL_K_TN +CGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c +endif + +ifndef CGEMM_SMALL_K_TT +CGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ + +$(KDIR)cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@ + +$(KDIR)cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@ + +$(KDIR)cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $@ + +$(KDIR)cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX 
-DTR $< -o $@ + +$(KDIR)cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $@ + +$(KDIR)cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + +ifndef CGEMM_SMALL_K_B0_NN +CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef CGEMM_SMALL_K_B0_NT +CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef CGEMM_SMALL_K_B0_TN +CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c +endif + +ifndef CGEMM_SMALL_K_B0_TT +CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT -DB0 $< -o 
$@ + +$(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@ + +ifndef ZGEMM_SMALL_M_PERMIT +ZGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c +endif + +ifndef ZGEMM_SMALL_K_NN +ZGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef ZGEMM_SMALL_K_NT +ZGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef ZGEMM_SMALL_K_TN +ZGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c +endif + +ifndef ZGEMM_SMALL_K_TT +ZGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ + + +$(KDIR)zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@ + +$(KDIR)zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@ + +$(KDIR)zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $@ + +$(KDIR)zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@ + +$(KDIR)zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $@ + +$(KDIR)zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + +ifndef ZGEMM_SMALL_K_B0_NN +ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef ZGEMM_SMALL_K_B0_NT +ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef ZGEMM_SMALL_K_B0_TN +ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c +endif + +ifndef ZGEMM_SMALL_K_B0_TT +ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + 
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@ diff --git a/kernel/arm/omatcopy_rt.c b/kernel/arm/omatcopy_rt.c index 9d58350d5..3d90ac6e4 100644 --- a/kernel/arm/omatcopy_rt.c +++ b/kernel/arm/omatcopy_rt.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project +Copyright (c) 2021, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,36 +27,208 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
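/*
 * Editorial sketch, not part of this patch: CNAME below performs the row-major,
 * transposed out-of-place matrix copy B = alpha * A^T, where A is rows x cols with
 * leading dimension lda and B is cols x rows with leading dimension ldb, so row i of A
 * becomes column i of B.  The rewritten kernel walks A in 4x4 tiles (four rows of A per
 * outer step, four columns per inner step) and handles the leftovers with the & 2 and
 * & 1 remainder branches.  An unblocked reference version of the same operation, with
 * a hypothetical name and shown for comparison only, would be:
 *
 *   static int omatcopy_rt_ref(BLASLONG rows, BLASLONG cols, FLOAT alpha,
 *                              FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
 *   {
 *       BLASLONG i, j;
 *       if (rows <= 0 || cols <= 0) return 0;
 *       for (i = 0; i < rows; i++)
 *           for (j = 0; j < cols; j++)
 *               b[j * ldb + i] = alpha * a[i * lda + j];
 *       return 0;
 *   }
 */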
#include "common.h" -/***************************************************** - * 2014/06/09 Saar - * - * Order rowMajor - * Trans - * -******************************************************/ - int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) { - BLASLONG i,j; - FLOAT *aptr,*bptr; + BLASLONG i, j; + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4; - if ( rows <= 0 ) return(0); - if ( cols <= 0 ) return(0); + if (rows <= 0) return 0; + if (cols <= 0) return 0; - aptr = a; + a_offset = a; + b_offset = b; - for ( i=0; i> 2); + if (i > 0) { + do { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; - return(0); + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 4; + + j = (cols >> 2); + if (j > 0) { + do { + /* Column 1 of MAT_B */ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + /* Column 2 of MAT_B */ + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + /* Column 3 of MAT_B */ + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + *(b_offset3 + 2) = *(a_offset3 + 2)*alpha; + *(b_offset4 + 2) = *(a_offset3 + 3)*alpha; + + /* Column 4 of MAT_B */ + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + *(b_offset3 + 3) = *(a_offset4 + 2)*alpha; + *(b_offset4 + 3) = *(a_offset4 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } // if(j > 0) + + + if (cols & 2) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + b_offset1 += ldb*2; + + } + + if (cols & 1) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + } + + i--; + } while (i > 0); + } -} + if (rows & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 2; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + b_offset1 
+= ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + b_offset1 += ldb*2; + + } + + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + } + } // if (rows & 2) + + + if (rows & 1) { + a_offset1 = a_offset; + a_offset += lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + a_offset1 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + a_offset1 += 2; + b_offset1 += ldb * 2; + } + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + } + } + + return 0; +} diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index 9249b54f8..79baa61b1 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA dot[0]=0.0; dot[1]=0.0; -#if !defined(__PPC__) && !defined(__SunOS) +#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI) CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; #else @@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } -#if !defined(__PPC__) && !defined(__SunOS) +#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI) CREAL(result) = dot[0]; CIMAG(result) = dot[1]; #else diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX new file mode 100644 index 000000000..bd25f7cd8 --- /dev/null +++ b/kernel/arm64/KERNEL.A64FX @@ -0,0 +1,216 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c + +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN = trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M 
= ztrsm_utcopy_sve.c + + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S + +SGEMMINCOPY = sgemm_ncopy_sve_v1.c +SGEMMITCOPY = sgemm_tcopy_sve_v1.c +SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S + +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMUNCOPY_M = trmm_uncopy_sve_v1.c +STRMMLNCOPY_M = trmm_lncopy_sve_v1.c +STRMMUTCOPY_M = trmm_utcopy_sve_v1.c +STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +SSYMMUCOPY_M = symm_ucopy_sve.c +SSYMMLCOPY_M = symm_lcopy_sve.c + +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S + +DGEMMINCOPY = dgemm_ncopy_sve_v1.c +DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY = cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c + 
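# Note (editorial, not part of this patch): the *COPY_M variables in this file feed the
# "ifdef ... else ... endif" blocks added earlier in this patch to the level-3 kernel
# Makefile; defining one here makes the inner copy routine build from the SVE source
# instead of the generic template.  For example, with CHEMMUTCOPY_M = zhemm_utcopy_sve.c
# the selected rule is effectively:
#
#   $(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/zhemm_utcopy_sve.c
#   	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
#
# while targets that leave it unset keep the generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c path.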
+CSYMMUCOPY_M = zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c + +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 603e47d87..c8a53c86b 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -97,9 +97,18 @@ CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE new file mode 100644 index 000000000..bd25f7cd8 --- /dev/null +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -0,0 +1,216 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c + +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN = trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c + + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + 
+SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S + +SGEMMINCOPY = sgemm_ncopy_sve_v1.c +SGEMMITCOPY = sgemm_tcopy_sve_v1.c +SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S + +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMUNCOPY_M = trmm_uncopy_sve_v1.c +STRMMLNCOPY_M = trmm_lncopy_sve_v1.c +STRMMUTCOPY_M = trmm_utcopy_sve_v1.c +STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +SSYMMUCOPY_M = symm_ucopy_sve.c +SSYMMLCOPY_M = symm_lcopy_sve.c + +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S + +DGEMMINCOPY = dgemm_ncopy_sve_v1.c +DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY = cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c + +CSYMMUCOPY_M = zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMUNCOPY_M = 
ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c + +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index e23133e52..e2e006770 100644 --- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -96,11 +96,20 @@ DNRM2KERNEL = nrm2.S CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S -DDOTKERNEL = dot.S -SDOTKERNEL = ../generic/dot.c -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -DSDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S SGEMM_BETA = sgemm_beta.S @@ -132,7 +141,7 @@ SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) @@ -160,7 +169,7 @@ endif DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c @@ -173,7 +182,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c diff --git a/kernel/arm64/KERNEL.CORTEXA55 b/kernel/arm64/KERNEL.CORTEXA55 new file mode 100644 index 000000000..e2e006770 --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA55 @@ -0,0 +1,196 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + 
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +else +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +endif +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c +CTRMMKERNEL = 
ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index dcf2383a9..0be334893 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -70,10 +70,19 @@ DCOPYKERNEL = copy.S CCOPYKERNEL = copy.S ZCOPYKERNEL = copy.S +ifneq ($(C_COMPILER), PGI) SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S SNRM2KERNEL = nrm2.S diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 new file mode 100644 index 000000000..ea010db42 --- /dev/null +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -0,0 +1,189 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S 
+DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = 
cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 new file mode 100644 index 000000000..ea010db42 --- /dev/null +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -0,0 +1,189 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = 
zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.THUNDERX b/kernel/arm64/KERNEL.THUNDERX index cb02c7bc5..669f62698 100644 --- a/kernel/arm64/KERNEL.THUNDERX +++ b/kernel/arm64/KERNEL.THUNDERX @@ -47,8 +47,13 @@ ZCOPYKERNEL = copy.S SDOTKERNEL = dot_thunderx.c DDOTKERNEL = ddot_thunderx.c +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL 
= zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S SNRM2KERNEL = nrm2.S diff --git a/kernel/arm64/KERNEL.TSV110 b/kernel/arm64/KERNEL.TSV110 index 1ce7bb7c0..54d016e17 100644 --- a/kernel/arm64/KERNEL.TSV110 +++ b/kernel/arm64/KERNEL.TSV110 @@ -72,8 +72,13 @@ ZCOPYKERNEL = copy.S SDOTKERNEL = dot.S DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S SNRM2KERNEL = nrm2.S diff --git a/kernel/arm64/KERNEL.VORTEX b/kernel/arm64/KERNEL.VORTEX index e3efef1f5..46a34469c 100644 --- a/kernel/arm64/KERNEL.VORTEX +++ b/kernel/arm64/KERNEL.VORTEX @@ -1 +1 @@ -include $(KERNELDIR)/KERNEL.ARMV8 +include $(KERNELDIR)/KERNEL.NEOVERSEN1 diff --git a/kernel/arm64/cgemm_kernel_8x4_cortexa53.c b/kernel/arm64/cgemm_kernel_8x4_cortexa53.c new file mode 100644 index 000000000..f9cd97852 --- /dev/null +++ b/kernel/arm64/cgemm_kernel_8x4_cortexa53.c @@ -0,0 +1,898 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
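The Cortex-A53 CGEMM kernel added below pairs an 8x4 inline-assembly microkernel with NEON-intrinsic kernels (built on the <arm_neon.h> intrinsics such as vld2q_f32/vfmaq_n_f32) for the smaller edge tiles. The FMLA_RR/FMLA_RI/FMLA_IR/FMLA_II macros expand to the string "fmla " or "fmls ", so a single assembly template covers every conjugation variant (the NN/NT/TN/TT, NR/NC/TR/TC, RN/RT/CN/CT and RR/RC/CR/CC families). A minimal scalar sketch of that sign convention, for illustration only (cmadd and the flag names are hypothetical and not part of the patch):

#include <stdio.h>

typedef struct { float r, i; } cplx;

/* c += op(a) * op(b), where op() optionally conjugates its argument.
 * The four sign factors mirror the four FMLA_* macro slots in the kernel. */
static cplx cmadd(cplx c, cplx a, cplx b, int conj_a, int conj_b)
{
    float s_ii = (conj_a ^ conj_b) ? +1.0f : -1.0f; /* FMLA_II slot */
    float s_ri = conj_b ? -1.0f : +1.0f;            /* FMLA_RI slot */
    float s_ir = conj_a ? -1.0f : +1.0f;            /* FMLA_IR slot */

    c.r += a.r * b.r;          /* FMLA_RR: always an fmla                       */
    c.r += s_ii * a.i * b.i;   /* fmls for NN and RR, fmla for the mixed cases  */
    c.i += s_ri * a.r * b.i;
    c.i += s_ir * a.i * b.r;
    return c;
}

int main(void)
{
    cplx acc = { 0, 0 }, a = { 1, 2 }, b = { 3, 4 };
    acc = cmadd(acc, a, b, 0, 0);          /* NN case: (1+2i)(3+4i) */
    printf("%g%+gi\n", acc.r, acc.i);      /* prints -5+10i         */
    acc = (cplx){ 0, 0 };
    acc = cmadd(acc, a, b, 1, 0);          /* RN case: conj(a)*b    */
    printf("%g%+gi\n", acc.r, acc.i);      /* prints 11-2i          */
    return 0;
}

Switching a given accumulator's instruction from fmla to fmls is exactly how the assembly flips one of these signs without duplicating the whole kernel body.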
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define FMLA_RI "fmla " +#define FMLA_IR "fmla " +#define FMLA_II "fmls " +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define FMLA_RI "fmls " +#define FMLA_IR "fmla " +#define FMLA_II "fmla " +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define FMLA_RI "fmla " +#define FMLA_IR "fmls " +#define FMLA_II "fmla " +#else +#define FMLA_RI "fmls " +#define FMLA_IR "fmls " +#define FMLA_II "fmls " +#endif +#define FMLA_RR "fmla " + +static inline void store_m8n1_contracted(float *C, + float32x4_t c1r, float32x4_t c1i, float32x4_t c2r, float32x4_t c2i, + float alphar, float alphai) { + + float32x4x2_t ld1 = vld2q_f32(C), ld2 = vld2q_f32(C + 8); + ld1.val[0] = vfmaq_n_f32(ld1.val[0], c1r, alphar); + ld2.val[0] = vfmaq_n_f32(ld2.val[0], c2r, alphar); + ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1r, alphai); + ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2r, alphai); + ld1.val[0] = vfmsq_n_f32(ld1.val[0], c1i, alphai); + ld2.val[0] = vfmsq_n_f32(ld2.val[0], c2i, alphai); + ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1i, alphar); + ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2i, alphar); + vst2q_f32(C, ld1); + vst2q_f32(C + 8, ld2); +} + +static inline void kernel_8x4(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + const float *c_pref = C; + float32x4_t c1r, c1i, c2r, c2i, c3r, c3i, c4r, c4i; + float32x4_t c5r, c5i, c6r, c6i, c7r, c7i, c8r, c8i; + + /** x0 for filling A, x1-x6 for filling B (x5 and x6 for real, x2 and x4 for imag) */ + /** v0-v1 and v10-v11 for B, v2-v9 for A */ + __asm__ __volatile__( + "cmp %[K],#0; mov %[c_pref],%[C]\n\t" + "movi %[c1r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" + "movi %[c1i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" + "movi %[c2r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "movi %[c2i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" + "movi %[c3r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" + "movi %[c3i].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "movi %[c4r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" + "movi %[c4i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" + "movi %[c5r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "movi %[c5i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" + "movi %[c6r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" + "movi %[c6i].16b,#0\n\t" + "movi %[c7r].16b,#0; movi %[c7i].16b,#0\n\t" + "movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t" + "beq 4f\n\t" + "cmp %[K],#2\n\t" + "ldp x1,x2,[%[sb]],#16; ldr q2,[%[sa]],#64\n\t" + "ldp x3,x4,[%[sb]],#16; ldr d3,[%[sa],#-48]\n\t" + "mov w5,w1; mov w6,w3; ldr x0,[%[sa],#-40]\n\t" + "bfi x5,x2,#32,#32; bfi x6,x4,#32,#32; fmov d0,x5\n\t" + "bfxil x2,x1,#32,#32; bfxil x4,x3,#32,#32; fmov v0.d[1],x6\n\t" + + "blt 3f; beq 2f\n\t" + "1:\n\t" + "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" + FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" + FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#64\n\t" + FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" + "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" + FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" + FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t" + FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" + "fmov v5.d[1],x0; fmov d1,x2\n\t" + FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-56]\n\t" + FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-48]\n\t" + FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" + "fmov v1.d[1],x4; ldr 
d6,[%[sa]]\n\t" + FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t" + FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-40]\n\t" + FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t" + "fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t" + FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t" + FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t" + FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t" + "fmov v7.d[1],x0; fmov d10,x5\n\t" + FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t" + FMLA_II "%[c1r].4s,v1.4s,v2.s[1]; ldr x1,[%[sb],#-32]\n\t" + FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t" + "fmov v10.d[1],x6; fmov d11,x2\n\t" + FMLA_II "%[c2r].4s,v1.4s,v2.s[3]; ldr x2,[%[sb],#-24]\n\t" + FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]; ldr x3,[%[sb],#-16]\n\t" + FMLA_II "%[c3r].4s,v1.4s,v3.s[1]; mov w5,w1\n\t" + "fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t" + FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t" + FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; ldr x4,[%[sb],#-8]\n\t" + FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]; bfi x5,x2,#32,#32\n\t" + "fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t" + FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t" + FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; mov w6,w3\n\t" + FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" + "fmov v9.d[1],x0; fmov d0,x5\n\t" + FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]; bfi x6,x4,#32,#32\n\t" + FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" + FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" + "fmov v0.d[1],x6; ldr d2,[%[sa],#64]\n\t" + FMLA_II "%[c8r].4s,v1.4s,v5.s[3]; ldr x0,[%[sa],#72]\n\t" + FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" + FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t" + "fmov v2.d[1],x0; ldr d3,[%[sa],#80]\n\t" + FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t" + FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]; ldr x0,[%[sa],#88]\n\t" + FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]; bfxil x2,x1,#32,#32\n\t" + FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]; bfxil x4,x3,#32,#32\n\t" + FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]; add %[sa],%[sa],#128\n\t" + FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]; prfm pldl1keep,[%[sb],#128]\n\t" + FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]; sub %[K],%[K],#2\n\t" + FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]; prfm pldl1keep,[%[sa],#128]\n\t" + FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]; prfm pldl1keep,[%[sa],#192]\n\t" + FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]; cmp %[K],#2\n\t" + FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t" + FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t" + FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t" + FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t" + FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t" + FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t" + FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t" + FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t" + FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t" + FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t" + FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t" + "bgt 1b; blt 3f\n\t" + "2:\n\t" + "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" + FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" + FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#32\n\t" + FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" + "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" + FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" + FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t" + FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" + 
"fmov v5.d[1],x0; fmov d1,x2\n\t" + FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-24]\n\t" + FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-16]\n\t" + FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" + "fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t" + FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t" + FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-8]\n\t" + FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t" + "fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t" + FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t" + FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t" + FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t" + "fmov v7.d[1],x0; fmov d10,x5\n\t" + FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t" + FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t" + FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t" + "fmov v10.d[1],x6; fmov d11,x2\n\t" + FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t" + FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t" + FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t" + "fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t" + FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t" + FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; sub %[K],%[K],#2\n\t" + FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t" + "fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t" + FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t" + FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; add %[sa],%[sa],#64\n\t" + FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" + "fmov v9.d[1],x0\n\t" + FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t" + FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" + FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" + FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t" FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t" + FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]\n\t" FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]\n\t" + FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]\n\t" FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]\n\t" + FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]\n\t" FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]\n\t" + FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]\n\t" FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]\n\t" + FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]\n\t" FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t" + FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t" + FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t" + FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t" + FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t" + FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t" + FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t" + FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t" + FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t" + FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t" + FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t" + "b 4f\n\t" + "3:\n\t" + "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" + FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" + FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]\n\t" + FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" + "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" + FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" + FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]\n\t" + FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" + "fmov v5.d[1],x0; fmov d1,x2\n\t" + FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]\n\t" + FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]\n\t" + FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" + "fmov v1.d[1],x4\n\t" + FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; sub %[K],%[K],#1\n\t" + FMLA_RR 
"%[c6r].4s,v0.4s,v4.s[2]\n\t" FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]\n\t" + FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]\n\t" FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]\n\t" + FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]\n\t" FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]\n\t" + FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t" FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]\n\t" + FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t" FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t" + FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t" FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]\n\t" + FMLA_II "%[c4r].4s,v1.4s,v3.s[3]\n\t" FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t" + FMLA_II "%[c5r].4s,v1.4s,v4.s[1]\n\t" FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]\n\t" + FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t" + FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" + FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" + "4:\n\t" + "mov %[c_pref],%[C]\n\t" + "zip1 v0.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref]]\n\t" + "zip1 v4.4s,%[c1i].4s,%[c2i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" + "zip1 v1.4s,%[c3r].4s,%[c4r].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "zip1 v5.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref]]\n\t" + "zip2 v2.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" + "zip2 v6.4s,%[c1i].4s,%[c2i].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "zip2 v3.4s,%[c3r].4s,%[c4r].4s; prfm pstl1keep,[%[c_pref]]\n\t" + "zip2 v7.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" + "zip1 %[c1r].2d,v0.2d,v1.2d; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "zip1 %[c1i].2d,v4.2d,v5.2d; prfm pstl1keep,[%[c_pref]]\n\t" + "zip2 %[c2r].2d,v0.2d,v1.2d; prfm pstl1keep,[%[c_pref],#64]\n\t" + "zip2 %[c2i].2d,v4.2d,v5.2d\n\t" + "zip1 %[c3r].2d,v2.2d,v3.2d; zip1 %[c3i].2d,v6.2d,v7.2d\n\t" + "zip2 %[c4r].2d,v2.2d,v3.2d; zip2 %[c4i].2d,v6.2d,v7.2d\n\t" + "zip1 v0.4s,%[c5r].4s,%[c6r].4s; zip1 v4.4s,%[c5i].4s,%[c6i].4s\n\t" + "zip1 v1.4s,%[c7r].4s,%[c8r].4s; zip1 v5.4s,%[c7i].4s,%[c8i].4s\n\t" + "zip2 v2.4s,%[c5r].4s,%[c6r].4s; zip2 v6.4s,%[c5i].4s,%[c6i].4s\n\t" + "zip2 v3.4s,%[c7r].4s,%[c8r].4s; zip2 v7.4s,%[c7i].4s,%[c8i].4s\n\t" + "zip1 %[c5r].2d,v0.2d,v1.2d; zip1 %[c5i].2d,v4.2d,v5.2d\n\t" + "zip2 %[c6r].2d,v0.2d,v1.2d; zip2 %[c6i].2d,v4.2d,v5.2d\n\t" + "zip1 %[c7r].2d,v2.2d,v3.2d; zip1 %[c7i].2d,v6.2d,v7.2d\n\t" + "zip2 %[c8r].2d,v2.2d,v3.2d; zip2 %[c8i].2d,v6.2d,v7.2d\n\t" + :[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i), + [c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i), + [c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i), + [c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i), + [K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb), [c_pref]"+r"(c_pref) + :[C]"r"(C), [LDC]"r"(LDC) + :"cc","memory","x0","x1","x2","x3","x4","x5","x6", + "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11"); + + store_m8n1_contracted(C, c1r, c1i, c5r, c5i, alphar, alphai); C += LDC * 2; + store_m8n1_contracted(C, c2r, c2i, c6r, c6i, alphar, alphai); C += LDC * 2; + store_m8n1_contracted(C, c3r, c3i, c7r, c7i, alphar, alphai); C += LDC * 2; + store_m8n1_contracted(C, c4r, c4i, c8r, c8i, alphar, alphai); +} + +static inline float32x4x4_t acc_expanded_m2n2(float32x4x4_t acc, + float32x4_t a, float32x4_t b) { + + acc.val[0] = vfmaq_laneq_f32(acc.val[0], a, b, 0); + acc.val[1] = vfmaq_laneq_f32(acc.val[1], a, b, 1); + acc.val[2] = vfmaq_laneq_f32(acc.val[2], a, b, 2); + acc.val[3] = vfmaq_laneq_f32(acc.val[3], a, b, 3); + return acc; +} + +static inline float32x4x4_t expand_alpha(float alphar, float alphai) { + float32x4x4_t 
ret; + const float maskp[] = { -1, 1, -1, 1 }; + const float maskn[] = { 1, -1, 1, -1 }; + const float32x4_t vrevp = vld1q_f32(maskp); + const float32x4_t vrevn = vld1q_f32(maskn); +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ret.val[0] = vdupq_n_f32(alphar); + ret.val[1] = vdupq_n_f32(-alphai); + ret.val[2] = vmulq_f32(ret.val[1], vrevn); + ret.val[3] = vmulq_f32(ret.val[0], vrevp); +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + ret.val[0] = vdupq_n_f32(alphar); + ret.val[1] = vdupq_n_f32(alphai); + ret.val[2] = vmulq_f32(ret.val[1], vrevp); + ret.val[3] = vmulq_f32(ret.val[0], vrevn); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + ret.val[2] = vdupq_n_f32(alphai); + ret.val[3] = vdupq_n_f32(alphar); + ret.val[0] = vmulq_f32(ret.val[3], vrevn); + ret.val[1] = vmulq_f32(ret.val[2], vrevp); +#else + ret.val[2] = vdupq_n_f32(alphai); + ret.val[3] = vdupq_n_f32(-alphar); + ret.val[0] = vmulq_f32(ret.val[3], vrevp); + ret.val[1] = vmulq_f32(ret.val[2], vrevn); +#endif + return ret; +} + +static inline void store_expanded_m2n2(float *C, BLASLONG LDC, + float32x4x4_t acc, float32x4x4_t expanded_alpha) { + + float32x4_t ld1 = vld1q_f32(C), ld2 = vld1q_f32(C + LDC * 2); + ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]); + ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[0]); + acc.val[0] = vrev64q_f32(acc.val[0]); + acc.val[2] = vrev64q_f32(acc.val[2]); + ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]); + ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[1]); + acc.val[1] = vrev64q_f32(acc.val[1]); + acc.val[3] = vrev64q_f32(acc.val[3]); + ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]); + ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[2]); + ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]); + ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[3]); + vst1q_f32(C, ld1); + vst1q_f32(C + LDC * 2, ld2); +} + +static inline float32x4x4_t init_expanded_m2n2() { + float32x4x4_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0), + vdupq_n_f32(0), vdupq_n_f32(0) }}; + return ret; +} + +static inline void kernel_4x4(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4), + b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c3 = acc_expanded_m2n2(c3, a1, b2); + c4 = acc_expanded_m2n2(c4, a2, b2); + c1 = acc_expanded_m2n2(c1, a3, b3); + c2 = acc_expanded_m2n2(c2, a4, b3); + c3 = acc_expanded_m2n2(c3, a3, b4); + c4 = acc_expanded_m2n2(c4, a4, b4); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c3 = acc_expanded_m2n2(c3, a1, b2); + c4 = acc_expanded_m2n2(c4, a2, b2); + } + + float32x4x4_t e_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n2(C, LDC, c1, e_alpha); + store_expanded_m2n2(C + 4, LDC, c2, e_alpha); + C += LDC * 4; + store_expanded_m2n2(C, LDC, c3, e_alpha); + store_expanded_m2n2(C + 4, LDC, c4, e_alpha); +} + +static inline void kernel_8x2(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) 
{ + + float32x4x4_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); + float32x4_t a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20); + float32x4_t a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c3 = acc_expanded_m2n2(c3, a3, b1); + c4 = acc_expanded_m2n2(c4, a4, b1); + c1 = acc_expanded_m2n2(c1, a5, b2); + c2 = acc_expanded_m2n2(c2, a6, b2); + c3 = acc_expanded_m2n2(c3, a7, b2); + c4 = acc_expanded_m2n2(c4, a8, b2); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); + float32x4_t b1 = vld1q_f32(sb); + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c3 = acc_expanded_m2n2(c3, a3, b1); + c4 = acc_expanded_m2n2(c4, a4, b1); + } + + float32x4x4_t e_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n2(C, LDC, c1, e_alpha); + store_expanded_m2n2(C + 4, LDC, c2, e_alpha); + store_expanded_m2n2(C + 8, LDC, c3, e_alpha); + store_expanded_m2n2(C + 12, LDC, c4, e_alpha); +} + +static inline void kernel_4x2(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2; + c1 = c2 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c1 = acc_expanded_m2n2(c1, a3, b2); + c2 = acc_expanded_m2n2(c2, a4, b2); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t b1 = vld1q_f32(sb); + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + } + + float32x4x4_t e_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n2(C, LDC, c1, e_alpha); + store_expanded_m2n2(C + 4, LDC, c2, e_alpha); +} + +static inline void kernel_2x4(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2; + c1 = c2 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); + float32x4_t b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a1, b2); + c1 = acc_expanded_m2n2(c1, a2, b3); + c2 = acc_expanded_m2n2(c2, a2, b4); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa); + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a1, b2); + } + + float32x4x4_t e_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n2(C, LDC, c1, e_alpha); + store_expanded_m2n2(C + LDC * 4, LDC, c2, e_alpha); +} + +static inline void kernel_2x2(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2; + c1 = c2 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = 
acc_expanded_m2n2(c2, a2, b2); + } + c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]); + c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]); + c1.val[2] = vaddq_f32(c1.val[2], c2.val[2]); + c1.val[3] = vaddq_f32(c1.val[3], c2.val[3]); + if (K) { + float32x4_t a1 = vld1q_f32(sa); + float32x4_t b1 = vld1q_f32(sb); + c1 = acc_expanded_m2n2(c1, a1, b1); + } + + store_expanded_m2n2(C, LDC, c1, expand_alpha(alphar, alphai)); +} + +static inline float32x4x2_t acc_expanded_m2n1(float32x4x2_t acc, + float32x4_t a, float32x2_t b) { + + acc.val[0] = vfmaq_lane_f32(acc.val[0], a, b, 0); + acc.val[1] = vfmaq_lane_f32(acc.val[1], a, b, 1); + return acc; +} + +static inline void store_expanded_m2n1(float *C, + float32x4x2_t acc, float32x4x4_t expanded_alpha) { + + float32x4_t ld1 = vld1q_f32(C); + ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]); + acc.val[0] = vrev64q_f32(acc.val[0]); + ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]); + acc.val[1] = vrev64q_f32(acc.val[1]); + ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]); + ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]); + vst1q_f32(C, ld1); +} + +static inline float32x4x2_t init_expanded_m2n1() { + float32x4x2_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0) }}; + return ret; +} + +static inline void kernel_8x1(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K) { + + float32x4x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n1(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12), + a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20), + a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32; + float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4; + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b1); + c3 = acc_expanded_m2n1(c3, a3, b1); + c4 = acc_expanded_m2n1(c4, a4, b1); + c1 = acc_expanded_m2n1(c1, a5, b2); + c2 = acc_expanded_m2n1(c2, a6, b2); + c3 = acc_expanded_m2n1(c3, a7, b2); + c4 = acc_expanded_m2n1(c4, a8, b2); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); + float32x2_t b1 = vld1_f32(sb); + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b1); + c3 = acc_expanded_m2n1(c3, a3, b1); + c4 = acc_expanded_m2n1(c4, a4, b1); + } + + float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n1(C, c1, expanded_alpha); + store_expanded_m2n1(C + 4, c2, expanded_alpha); + store_expanded_m2n1(C + 8, c3, expanded_alpha); + store_expanded_m2n1(C + 12, c4, expanded_alpha); +} + +static inline void kernel_4x1(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K) { + + float32x4x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n1(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; + float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4; + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b1); + c3 = acc_expanded_m2n1(c3, a3, b2); + c4 = acc_expanded_m2n1(c4, a4, b2); + } + c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]); + c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]); + c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]); + c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]); + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x2_t b1 = vld1_f32(sb); + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = 
acc_expanded_m2n1(c2, a2, b1); + } + + float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n1(C, c1, expanded_alpha); + store_expanded_m2n1(C + 4, c2, expanded_alpha); +} + +static inline void kernel_2x1(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K) { + + float32x4x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n1(); + + for (; K > 3; K -= 4) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; + float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2), + b3 = vld1_f32(sb + 4), b4 = vld1_f32(sb + 6); sb += 8; + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b2); + c3 = acc_expanded_m2n1(c3, a3, b3); + c4 = acc_expanded_m2n1(c4, a4, b4); + } + c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]); + c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]); + c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]); + c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]); + c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]); + c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]); + for (; K; K--) { + float32x4_t a1 = vld1q_f32(sa); sa += 4; + float32x2_t b1 = vld1_f32(sb); sb += 2; + c1 = acc_expanded_m2n1(c1, a1, b1); + } + + float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n1(C, c1, expanded_alpha); +} + +static inline float32x2x4_t expand_alpha_d(float alphar, float alphai) { + float32x2x4_t ret; + const float maskp[] = { -1, 1 }; + const float maskn[] = { 1, -1 }; + const float32x2_t vrevp = vld1_f32(maskp); + const float32x2_t vrevn = vld1_f32(maskn); +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ret.val[0] = vdup_n_f32(alphar); + ret.val[1] = vdup_n_f32(-alphai); + ret.val[2] = vmul_f32(ret.val[1], vrevn); + ret.val[3] = vmul_f32(ret.val[0], vrevp); +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + ret.val[0] = vdup_n_f32(alphar); + ret.val[1] = vdup_n_f32(alphai); + ret.val[2] = vmul_f32(ret.val[1], vrevp); + ret.val[3] = vmul_f32(ret.val[0], vrevn); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + ret.val[2] = vdup_n_f32(alphai); + ret.val[3] = vdup_n_f32(alphar); + ret.val[0] = vmul_f32(ret.val[3], vrevn); + ret.val[1] = vmul_f32(ret.val[2], vrevp); +#else + ret.val[2] = vdup_n_f32(alphai); + ret.val[3] = vdup_n_f32(-alphar); + ret.val[0] = vmul_f32(ret.val[3], vrevp); + ret.val[1] = vmul_f32(ret.val[2], vrevn); +#endif + return ret; +} + +static inline float32x2x2_t acc_expanded_m1n1(float32x2x2_t acc, + float32x2_t a, float32x2_t b) { + + acc.val[0] = vfma_lane_f32(acc.val[0], a, b, 0); + acc.val[1] = vfma_lane_f32(acc.val[1], a, b, 1); + return acc; +} + +static inline void store_expanded_m1n1(float *C, + float32x2x2_t acc, float32x2x4_t expanded_alpha) { + + float32x2_t ld1 = vld1_f32(C); + ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[0]); + acc.val[0] = vrev64_f32(acc.val[0]); + ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[1]); + acc.val[1] = vrev64_f32(acc.val[1]); + ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[2]); + ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[3]); + vst1_f32(C, ld1); +} + +static inline float32x2x2_t init_expanded_m1n1() { + float32x2x2_t ret = {{ vdup_n_f32(0), vdup_n_f32(0) }}; + return ret; +} + +static inline void kernel_1x4(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m1n1(); + + for (; K; K--) { + float32x2_t a1 = 
vld1_f32(sa); sa += 2; + c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); + c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); + c3 = acc_expanded_m1n1(c3, a1, vld1_f32(sb + 4)); + c4 = acc_expanded_m1n1(c4, a1, vld1_f32(sb + 6)); + sb += 8; + } + + float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai); + store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2; + store_expanded_m1n1(C, c2, expanded_alpha); C += LDC * 2; + store_expanded_m1n1(C, c3, expanded_alpha); C += LDC * 2; + store_expanded_m1n1(C, c4, expanded_alpha); +} + +static inline void kernel_1x2(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m1n1(); + + for (; K > 1; K -= 2) { + float32x2_t a1 = vld1_f32(sa), a2 = vld1_f32(sa + 2); sa += 4; + c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); + c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); + c3 = acc_expanded_m1n1(c3, a2, vld1_f32(sb + 4)); + c4 = acc_expanded_m1n1(c4, a2, vld1_f32(sb + 6)); + sb += 8; + } + c1.val[0] = vadd_f32(c1.val[0], c3.val[0]); + c1.val[1] = vadd_f32(c1.val[1], c3.val[1]); + c2.val[0] = vadd_f32(c2.val[0], c4.val[0]); + c2.val[1] = vadd_f32(c2.val[1], c4.val[1]); + if (K) { + float32x2_t a1 = vld1_f32(sa); + c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); + c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); + } + + float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai); + store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2; + store_expanded_m1n1(C, c2, expanded_alpha); +} + +static inline void kernel_1x1(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K) { + + float32x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m1n1(); + + for (; K > 3; K -= 4) { + c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb)); + c2 = acc_expanded_m1n1(c2, vld1_f32(sa + 2), vld1_f32(sb + 2)); + c3 = acc_expanded_m1n1(c3, vld1_f32(sa + 4), vld1_f32(sb + 4)); + c4 = acc_expanded_m1n1(c4, vld1_f32(sa + 6), vld1_f32(sb + 6)); + sa += 8; sb += 8; + } + c1.val[0] = vadd_f32(c1.val[0], c3.val[0]); + c1.val[1] = vadd_f32(c1.val[1], c3.val[1]); + c2.val[0] = vadd_f32(c2.val[0], c4.val[0]); + c2.val[1] = vadd_f32(c2.val[1], c4.val[1]); + c1.val[0] = vadd_f32(c1.val[0], c2.val[0]); + c1.val[1] = vadd_f32(c1.val[1], c2.val[1]); + for (; K; K--) { + c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb)); + sa += 2; sb += 2; + } + + store_expanded_m1n1(C, c1, expand_alpha_d(alphar, alphai)); +} + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, + FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { + + BLASLONG n_left = N; + for (; n_left >= 8; n_left -= 8) { + const FLOAT *a_ = sa; + FLOAT *c1_ = C; + FLOAT *c2_ = C + LDC * 8; + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + K * 8; + BLASLONG m_left = M; + for (; m_left >= 8; m_left -= 8) { + kernel_8x4(a_, b1_, c1_, alphar, alphai, K, LDC); + kernel_8x4(a_, b2_, c2_, alphar, alphai, K, LDC); + a_ += 16 * K; + c1_ += 16; + c2_ += 16; + } + if (m_left >= 4) { + m_left -= 4; + kernel_4x4(a_, b1_, c1_, alphar, alphai, K, LDC); + kernel_4x4(a_, b2_, c2_, alphar, alphai, K, LDC); + a_ += 8 * K; + c1_ += 8; + c2_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x4(a_, b1_, c1_, alphar, alphai, K, LDC); + kernel_2x4(a_, b2_, c2_, alphar, alphai, K, LDC); + a_ += 4 * K; + c1_ += 4; + c2_ += 4; + } + if (m_left) { + kernel_1x4(a_, b1_, c1_, alphar, alphai, K, LDC); + kernel_1x4(a_, b2_, c2_, alphar, alphai, K, LDC); + } + C += 16 * 
LDC; + sb += 16 * K; + } + + if (n_left >= 4) { + n_left -= 4; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 8; m_left -= 8) { + kernel_8x4(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 16 * K; + c_ += 16; + } + if (m_left >= 4) { + m_left -= 4; + kernel_4x4(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x4(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x4(a_, sb, c_, alphar, alphai, K, LDC); + } + C += 8 * LDC; + sb += 8 * K; + } + + if (n_left >= 2) { + n_left -= 2; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 8; m_left -= 8) { + kernel_8x2(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 16 * K; + c_ += 16; + } + if (m_left >= 4) { + m_left -= 4; + kernel_4x2(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x2(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x2(a_, sb, c_, alphar, alphai, K, LDC); + } + C += 4 * LDC; + sb += 4 * K; + } + + if (n_left) { + BLASLONG m_left = M; + for (; m_left >= 8; m_left -= 8) { + kernel_8x1(sa, sb, C, alphar, alphai, K); + sa += 16 * K; + C += 16; + } + if (m_left >= 4) { + m_left -= 4; + kernel_4x1(sa, sb, C, alphar, alphai, K); + sa += 8 * K; + C += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x1(sa, sb, C, alphar, alphai, K); + sa += 4 * K; + C += 4; + } + if (m_left) { + kernel_1x1(sa, sb, C, alphar, alphai, K); + } + } + return 0; +} + diff --git a/kernel/arm64/cgemm_kernel_sve_v1x4.S b/kernel/arm64/cgemm_kernel_sve_v1x4.S new file mode 100644 index 000000000..38770f66b --- /dev/null +++ b/kernel/arm64/cgemm_kernel_sve_v1x4.S @@ -0,0 +1,874 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
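The SVE kernel that follows is vector-length agnostic: each KERNELv1x4 step consumes one SVE vector of complex elements along M (the `lanes` register holds the lane count) against four columns of B, ld2w/st2w keep real and imaginary parts in separate Z registers, and the predicate p1 selects the active lanes, so the same assembly runs on any SVE vector length. A minimal sketch of that predicated, length-agnostic loop pattern in ACLE intrinsics (illustration only, assuming an SVE-enabled toolchain; the kernel itself is hand-written assembly):

#include <arm_sve.h>
#include <stdint.h>

/* y += alpha * x for n floats, processing svcntw() lanes per iteration;
 * the whilelt predicate covers the tail without a scalar cleanup loop. */
void saxpy_vla(int64_t n, float alpha, const float *x, float *y)
{
    for (int64_t i = 0; i < n; i += (int64_t)svcntw()) {
        svbool_t pg = svwhilelt_b32_s64(i, n);   /* active lanes only        */
        svfloat32_t vx = svld1_f32(pg, x + i);
        svfloat32_t vy = svld1_f32(pg, y + i);
        vy = svmla_n_f32_m(pg, vy, vx, alpha);   /* vy += alpha * vx (active) */
        svst1_f32(pg, y + i, vy);
    }
}

Because the lane count is only known at run time, one binary covers 128-bit through 2048-bit SVE implementations; that is the point of the v1x4 blocking (one vector of M by four columns of N) compared with the fixed 8x4 tiles of the NEON kernels.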
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR w19 +#define alphaI w20 + +#define alphaz_R z6.s +#define alphaz_I z7.s +#define alpha0_R s4 +#define alpha0_I s5 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x4_I + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + ld2w {z2.s, z3.s}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] +#if defined(NR) || 
defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.s, p1/m, z0.s, z9.s +#else + fmla z17.s, p1/m, z0.s, z9.s +#endif + OP_ii z16.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + + fmla z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.s, p1/m, z0.s, z11.s +#else + fmla z19.s, p1/m, z0.s, z11.s +#endif + ld1rw z11.s, p0/z, [pB, 12] + + + fmla z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.s, p1/m, z0.s, z13.s +#else + fmla z21.s, p1/m, z0.s, z13.s +#endif + OP_ii z20.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + + fmla z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.s, p1/m, z0.s, z15.s +#else + fmla z23.s, p1/m, z0.s, z15.s +#endif + OP_ii z22.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2w {z2.s, z3.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes *2 * 4 + + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.s, p1/m, z2.s, z8.s 
+ OP_ir z17.s, p1/m, z3.s, z8.s + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 32 + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + ld2w {z26.s, z27.s}, p1/z, [pCRow1] + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld2w {z28.s, z29.s}, p1/z, [pCRow2] + fmla z28.s, p1/m, z20.s, alphaz_R + fmls z28.s, p1/m, z21.s, alphaz_I + fmla z29.s, p1/m, z20.s, alphaz_I + fmla z29.s, p1/m, z21.s, alphaz_R + st2w {z28.s, z29.s}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #3 + + ld2w {z30.s, z31.s}, p1/z, [pCRow3] + fmla z30.s, p1/m, z22.s, alphaz_R + fmls z30.s, p1/m, z23.s, alphaz_I + fmla z31.s, p1/m, z22.s, alphaz_I + fmla z31.s, p1/m, z23.s, alphaz_R + st2w {z30.s, z31.s}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 16 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, 
#C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + ld2w {z26.s, z27.s}, p1/z, [pCRow1] + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.s, #0 + dup z17.s, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + dup alphaz_R, alphaR + fmov alphaI, s1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #3 // ldc = ldc * 2 * 4 + ptrue p0.s // create true predicate + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lcgemm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lcgemm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + + mov pA, origPA // pA = start of A array + +.Lcgemm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lcgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 + cmp counterL , #2 + blt .Lcgemm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lcgemm_kernel_L4_Mv1_22a + + .align 5 +.Lcgemm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L4_Mv1_22 + + .align 5 +.Lcgemm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lcgemm_kernel_L4_Mv1_44 + + .align 5 +.Lcgemm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lcgemm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lcgemm_kernel_L4_Mv1_44 + + +.Lcgemm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lcgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Lcgemm_kernel_L4_Mv1_100 + + .align 5 +.Lcgemm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lcgemm_kernel_L4_Mv1_46 + +.Lcgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lcgemm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Lcgemm_kernel_L4_Mv1_20 + + + +.Lcgemm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 4 * 2 + + subs counterJ, counterJ , #1 // j-- + bgt .Lcgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lcgemm_kernel_L999 + + tst counterJ , #2 + ble .Lcgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +.Lcgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lcgemm_kernel_L2_Mv1_20: + + INITv1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lcgemm_kernel_L2_Mv1_40 + .align 5 + +.Lcgemm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L2_Mv1_22 + + +.Lcgemm_kernel_L2_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lcgemm_kernel_L2_Mv1_100 + +.Lcgemm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L2_Mv1_42 + +.Lcgemm_kernel_L2_Mv1_100: + + SAVEv1x2 + +.Lcgemm_kernel_L2_Mv1_END: + + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lcgemm_kernel_L2_Mv1_20 + + +.Lcgemm_kernel_L2_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 2 * 4 * 2 + 
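// The .L*_Mv1_* loops above and below are driven purely by SVE predication:
// whilelt builds the lane mask from counterI and origM, cntp yields the active
// lane count ("lanes"), and incw + b.any step and terminate the loop, so the
// final M % VL rows run through the same code path under a partial predicate.
// A minimal C sketch of that control flow is kept below for reference only and
// is compiled out; it uses the same ACLE intrinsics as the cgemm copy kernels
// in this patch, and the function name and the elided micro-kernel body are
// illustrative, not part of the build.
#if 0
#include <arm_sve.h>
#include <stdint.h>

static void predicated_m_loop(int32_t M)
{
    int32_t i = 0;
    svbool_t pg = svwhilelt_b32(i, M);                  /* lanes with i + lane < M */
    while (svptest_any(svptrue_b32(), pg)) {
        uint64_t lanes = svcntp_b32(svptrue_b32(), pg); /* number of active lanes */
        /* ... run the v1x4 micro-kernel on `lanes` rows of A and C ... */
        (void)lanes;
        i += (int32_t)svcntw();                         /* step by the SVE vector length */
        pg = svwhilelt_b32(i, M);                       /* partial mask on the last sweep */
    }
}
#endif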
+/******************************************************************************/ + +.Lcgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lcgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +.Lcgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lcgemm_kernel_L1_Mv1_20: + + INITv1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lcgemm_kernel_L1_Mv1_40 + .align 5 + +.Lcgemm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L1_Mv1_22 + + +.Lcgemm_kernel_L1_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lcgemm_kernel_L1_Mv1_100 + +.Lcgemm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L1_Mv1_42 + +.Lcgemm_kernel_L1_Mv1_100: + + SAVEv1x1 + +.Lcgemm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lcgemm_kernel_L1_Mv1_20 + +.Lcgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lcgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/cgemm_ncopy_sve_v1.c b/kernel/arm64/cgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..6aa44a8f6 --- /dev/null +++ b/kernel/arm64/cgemm_ncopy_sve_v1.c @@ -0,0 +1,79 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + svint32_t lda_vec = svindex_s32(0, lda * 2); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b32(j, n); + uint32_t active = svcntp_b32(svptrue_b32(), pg); + do { + + aoffset1 = aoffset; + + uint32_t i_cnt = m; + while (i_cnt--) { + svfloat32_t a_vec_real = svld1_gather_index(pg, (float *) aoffset1, lda_vec); + svfloat32_t a_vec_imag = svld1_gather_index(pg, ((float *) aoffset1) + 1, lda_vec); + svst2_f32(pg, (float *) boffset, svcreate2(a_vec_real, a_vec_imag)); + aoffset1 += 2; + boffset += active * 2; + } + aoffset += active * lda * 2; + + j += svcntw(); + pg = svwhilelt_b32(j, n); + active = svcntp_b32(svptrue_b32(), pg); + + + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/cgemm_tcopy_sve_v1.c b/kernel/arm64/cgemm_tcopy_sve_v1.c new file mode 100644 index 000000000..748cd954e --- /dev/null +++ b/kernel/arm64/cgemm_tcopy_sve_v1.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b32(j, n); + uint32_t active = svcntp_b32(svptrue_b32(), pg); + do { + + aoffset1 = aoffset; + + uint32_t i_cnt = m; + while (i_cnt--) { + svfloat32x2_t a_vec = svld2(pg, (float *)aoffset1); + svst2_f32(pg, (float *) boffset, a_vec); + aoffset1 += lda * 2; + boffset += active * 2; + } + aoffset += active * 2; + + j += svcntw(); + pg = svwhilelt_b32(j, n); + active = svcntp_b32(svptrue_b32(), pg); + + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/ctrmm_kernel_sve_v1x4.S b/kernel/arm64/ctrmm_kernel_sve_v1x4.S new file mode 100644 index 000000000..242968f63 --- /dev/null +++ b/kernel/arm64/ctrmm_kernel_sve_v1x4.S @@ -0,0 +1,1006 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR w19 +#define alphaI w20 +#define temp x21 +#define tempOffset x22 +#define tempK x23 + +#define alphaz_R z6.s +#define alphaz_I z7.s +#define alpha0_R s6 +#define alpha0_I s7 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x4_I + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + ld2w {z2.s, z3.s}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, 
p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.s, p1/m, z0.s, z9.s +#else + fmla z17.s, p1/m, z0.s, z9.s +#endif + OP_ii z16.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + + fmla z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.s, p1/m, z0.s, z11.s +#else + fmla z19.s, p1/m, z0.s, z11.s +#endif + ld1rw z11.s, p0/z, [pB, 12] + + + fmla z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.s, p1/m, z0.s, z13.s +#else + fmla z21.s, p1/m, z0.s, z13.s +#endif + OP_ii z20.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + + fmla z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.s, p1/m, z0.s, z15.s +#else + fmla z23.s, p1/m, z0.s, z15.s +#endif + OP_ii z22.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2w {z2.s, z3.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes *2 * 4 + + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] 
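	// M1/M2 form a software pipeline: M1 multiplies from z0/z1 while loading the
	// next A slice into z2/z3, and M2 multiplies from z2/z3 while reloading z0/z1,
	// so the ld2w of A overlaps the FMLA chain of the other half. Per column n
	// (n = 0..3), z(16+2n) accumulates the real parts and z(17+2n) the imaginary
	// parts, with the OP_rr/OP_ii/OP_ri/OP_ir signs chosen above per conjugation
	// variant.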
+.endm + +.macro KERNELv1x4_E + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 32 + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + eor z28.d, z16.d, z16.d + eor z29.d, z16.d, z16.d + fmla z28.s, p1/m, z20.s, alphaz_R + fmls z28.s, p1/m, z21.s, alphaz_I + fmla z29.s, p1/m, z20.s, alphaz_I + fmla z29.s, p1/m, z21.s, alphaz_R + st2w {z28.s, z29.s}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #3 + + eor z30.d, z16.d, z16.d + eor z31.d, z16.d, z16.d + fmla z30.s, p1/m, z22.s, alphaz_R + fmls z30.s, p1/m, z23.s, alphaz_I + fmla z31.s, p1/m, z22.s, alphaz_I + fmla z31.s, p1/m, z23.s, alphaz_R + st2w {z30.s, z31.s}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, 
z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 16 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.s, #0 + dup z17.s, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + dup alphaz_R, alphaR + fmov alphaI, s1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #3 // ldc = ldc * 2 * 4 + ptrue p0.s // create true predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lctrmm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lctrmm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +.Lctrmm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lctrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 + cmp counterL , #2 + blt .Lctrmm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lctrmm_kernel_L4_Mv1_22a + + .align 5 +.Lctrmm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L4_Mv1_22 + + .align 5 +.Lctrmm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lctrmm_kernel_L4_Mv1_44 + + .align 5 +.Lctrmm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lctrmm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lctrmm_kernel_L4_Mv1_44 + + +.Lctrmm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lctrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Lctrmm_kernel_L4_Mv1_100 + + .align 5 +.Lctrmm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lctrmm_kernel_L4_Mv1_46 + +.Lctrmm_kernel_L4_Mv1_100: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lctrmm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Lctrmm_kernel_L4_Mv1_20 + + + +.Lctrmm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lctrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lctrmm_kernel_L999 + + tst counterJ , #2 + ble .Lctrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + + + +.Lctrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lctrmm_kernel_L2_Mv1_20: + + 
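	// TRMM block setup follows: depending on LEFT/TRANSA, either start at origPB
	// directly or first skip tempOffset K-steps of the packed panels
	// (lanes * 2 * 4 bytes per step of A, 2 * 2 * 4 bytes per step of this
	// 2-column B panel), then compute tempK, the number of K updates this block
	// actually performs; this mirrors the v1x4 path above.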
INITv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lctrmm_kernel_L2_Mv1_40 + .align 5 + +.Lctrmm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L2_Mv1_22 + + +.Lctrmm_kernel_L2_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lctrmm_kernel_L2_Mv1_100 + +.Lctrmm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L2_Mv1_42 + +.Lctrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lctrmm_kernel_L2_Mv1_END: + + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lctrmm_kernel_L2_Mv1_20 + + +.Lctrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +.Lctrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lctrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +.Lctrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lctrmm_kernel_L1_Mv1_20: + + INITv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lctrmm_kernel_L1_Mv1_40 + .align 5 + +.Lctrmm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L1_Mv1_22 + + +.Lctrmm_kernel_L1_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lctrmm_kernel_L1_Mv1_100 + +.Lctrmm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L1_Mv1_42 + +.Lctrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if 
defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lctrmm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lctrmm_kernel_L1_Mv1_20 + +.Lctrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lctrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dgemm_kernel_4x4_cortexa53.c b/kernel/arm64/dgemm_kernel_4x4_cortexa53.c new file mode 100644 index 000000000..5a9d284df --- /dev/null +++ b/kernel/arm64/dgemm_kernel_4x4_cortexa53.c @@ -0,0 +1,890 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +/********************************************************** + * Function: dgemm_kernel_arm_cortex_a53_4x4_m4n12 + * Operation: C[4][12] += alpha * sa[4][K] * sb[K][12] + * Matrix orders: + * sa: column-major (leading dimension == 4) + * sb: 3 concatenated row-major 4-column submatrices + * C: column-major (leading dimension == LDC) + *********************************************************/ +static inline void dgemm_kernel_arm_cortex_a53_4x4_m4n12( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + /** prefetch 4x12 elements from matrix C for RW purpose */ + __asm__ __volatile__( + "mov x0,%[C]\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]\n\t" + ::[C]"r"(C), [LDC]"r"(LDC):"x0"); + + /** 3 pointers to 3 submatrices of sb respectively */ + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + K * 4; + const FLOAT *b3_ = sb + K * 8; + + /** register mapping of 4x12 elements of C, row-id ==> coordinate-M, column-id ==> coordinate-N */ + /** v8.d[0] v10.d[0] v12.d[0] v14.d[0] v16.d[0] v18.d[0] v20.d[0] v22.d[0] v24.d[0] v26.d[0] v28.d[0] v30.d[0] */ + /** v8.d[1] v10.d[1] v12.d[1] v14.d[1] v16.d[1] v18.d[1] v20.d[1] v22.d[1] v24.d[1] v26.d[1] v28.d[1] v30.d[1] */ + /** v9.d[0] v11.d[0] v13.d[0] v15.d[0] v17.d[0] v19.d[0] v21.d[0] v23.d[0] v25.d[0] v27.d[0] v29.d[0] v31.d[0] */ + /** v9.d[1] v11.d[1] v13.d[1] v15.d[1] v17.d[1] v19.d[1] v21.d[1] v23.d[1] v25.d[1] v27.d[1] v29.d[1] v31.d[1] */ + + __asm__ __volatile__( + "cmp %[K],#0\n\t" + /** fill registers holding elements of C with 0.0 */ + "movi v8.16b,#0; movi v9.16b,#0; movi v10.16b,#0; movi v11.16b,#0\n\t" + "movi v12.16b,#0; movi v13.16b,#0; movi v14.16b,#0; movi v15.16b,#0\n\t" + "movi v16.16b,#0; movi v17.16b,#0; movi v18.16b,#0; movi v19.16b,#0\n\t" + "movi v20.16b,#0; movi v21.16b,#0; movi v22.16b,#0; movi v23.16b,#0\n\t" + "movi v24.16b,#0; movi v25.16b,#0; movi v26.16b,#0; movi v27.16b,#0\n\t" + "movi v28.16b,#0; movi v29.16b,#0; movi v30.16b,#0; movi v31.16b,#0\n\t" + "beq 4f; cmp %[K],#2\n\t" + /** register v0-v3 for loading A, v4-v7 for loading B, x0 for transporting data */ + "ldp q0,q1,[%[sa]]; ldp q4,q5,[%[b1_]]\n\t" + "ldr d6,[%[b2_]]; ldr x0,[%[b2_],#8]\n\t" + "blt 3f; beq 2f\n\t" + "1:\n\t" + /** main loop with unroll_k = 2, specially designed for cortex-A53 NEON pipeline */ + "ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" + "fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" + "fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t" + "fmla v10.2d,v0.2d,v4.d[1]\n\t" + "ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t" + "fmla 
v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t" + "fmla v12.2d,v0.2d,v5.d[0]\n\t" + "fmla v13.2d,v1.2d,v5.d[0]\n\t" + "ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t" + "fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" + "fmla v15.2d,v1.2d,v5.d[1]\n\t" + "fmla v16.2d,v0.2d,v6.d[0]\n\t" + "ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" + "fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" + "fmla v18.2d,v0.2d,v6.d[1]\n\t" + "fmla v19.2d,v1.2d,v6.d[1]\n\t" + "ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t" + "fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t" + "fmla v21.2d,v1.2d,v7.d[0]; add %[sa],%[sa],#64\n\t" + "fmla v22.2d,v0.2d,v7.d[1]\n\t" + "ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t" + "fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t" + "fmla v24.2d,v0.2d,v4.d[0]; prfm pldl1keep,[%[b1_],#128]\n\t" + "fmla v25.2d,v1.2d,v4.d[0]\n\t" + "ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t" + "fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t" + "fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t" + "fmla v28.2d,v0.2d,v5.d[0]\n\t" + "ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t" + "fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t" + "fmla v30.2d,v0.2d,v5.d[1]; prfm pldl1keep,[%[b2_],#128]\n\t" + "fmla v31.2d,v1.2d,v5.d[1]\n\t" + "ldr d0,[%[sa]]; fmov v4.d[1],x0\n\t" + "fmla v8.2d,v2.2d,v6.d[0]; ldr x0,[%[sa],#8]\n\t" + "fmla v9.2d,v3.2d,v6.d[0]\n\t" + "fmla v10.2d,v2.2d,v6.d[1]\n\t" + "ldr d5,[%[b2_],#48]; fmov v0.d[1],x0\n\t" + "fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t" + "fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t" + "fmla v13.2d,v3.2d,v7.d[0]\n\t" + "ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t" + "fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t" + "fmla v15.2d,v3.2d,v7.d[1]; prfm pldl1keep,[%[b3_],#128]\n\t" + "fmla v16.2d,v2.2d,v4.d[0]\n\t" + "ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t" + "fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t" + "fmla v18.2d,v2.2d,v4.d[1]; add %[b3_],%[b3_],#64\n\t" + "fmla v19.2d,v3.2d,v4.d[1]\n\t" + "ldr d1,[%[sa],#16]; fmov v7.d[1],x0\n\t" + "fmla v20.2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#24]\n\t" + "fmla v21.2d,v3.2d,v5.d[0]\n\t" + "fmla v22.2d,v2.2d,v5.d[1]\n\t" + "ldr d4,[%[b1_]]; fmov v1.d[1],x0\n\t" + "fmla v23.2d,v3.2d,v5.d[1]; ldr x0,[%[b1_],#8]\n\t" + "fmla v24.2d,v2.2d,v6.d[0]\n\t" + "fmla v25.2d,v3.2d,v6.d[0]\n\t" + "ldr d5,[%[b1_],#16]; fmov v4.d[1],x0\n\t" + "fmla v26.2d,v2.2d,v6.d[1]; ldr x0,[%[b1_],#24]\n\t" + "fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t" + "fmla v28.2d,v2.2d,v7.d[0]\n\t" + "ldr d6,[%[b2_]]; fmov v5.d[1],x0\n\t" + "fmla v29.2d,v3.2d,v7.d[0]; ldr x0,[%[b2_],#8]\n\t" + "fmla v30.2d,v2.2d,v7.d[1]; cmp %[K],#2\n\t" + "fmla v31.2d,v3.2d,v7.d[1]\n\t" + "bgt 1b; blt 3f\n\t" + "2:\n\t" + /** tail part with k = 2 */ + "ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" + "fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" + "fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t" + "fmla v10.2d,v0.2d,v4.d[1]\n\t" + "ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t" + "fmla v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t" + "fmla v12.2d,v0.2d,v5.d[0]\n\t" + "fmla v13.2d,v1.2d,v5.d[0]\n\t" + "ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t" + "fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" + "fmla v15.2d,v1.2d,v5.d[1]\n\t" + "fmla v16.2d,v0.2d,v6.d[0]\n\t" + "ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" + "fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" + "fmla v18.2d,v0.2d,v6.d[1]\n\t" + "fmla v19.2d,v1.2d,v6.d[1]\n\t" + "ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t" + "fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t" + "fmla v21.2d,v1.2d,v7.d[0]; add 
%[sa],%[sa],#64\n\t" + "fmla v22.2d,v0.2d,v7.d[1]\n\t" + "ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t" + "fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t" + "fmla v24.2d,v0.2d,v4.d[0]\n\t" + "fmla v25.2d,v1.2d,v4.d[0]\n\t" + "ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t" + "fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t" + "fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t" + "fmla v28.2d,v0.2d,v5.d[0]\n\t" + "ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t" + "fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t" + "fmla v30.2d,v0.2d,v5.d[1]\n\t" + "fmla v31.2d,v1.2d,v5.d[1]\n\t" + "fmov v4.d[1],x0\n\t" + "fmla v8.2d,v2.2d,v6.d[0]\n\t" + "fmla v9.2d,v3.2d,v6.d[0]\n\t" + "fmla v10.2d,v2.2d,v6.d[1]\n\t" + "ldr d5,[%[b2_],#48]\n\t" + "fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t" + "fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t" + "fmla v13.2d,v3.2d,v7.d[0]\n\t" + "ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t" + "fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t" + "fmla v15.2d,v3.2d,v7.d[1]\n\t" + "fmla v16.2d,v2.2d,v4.d[0]\n\t" + "ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t" + "fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t" + "fmla v18.2d,v2.2d,v4.d[1]; add %[b3_],%[b3_],#64\n\t" + "fmla v19.2d,v3.2d,v4.d[1]\n\t" + "fmov v7.d[1],x0\n\t" + "fmla v20.2d,v2.2d,v5.d[0]\n\t" + "fmla v21.2d,v3.2d,v5.d[0]\n\t" + "fmla v22.2d,v2.2d,v5.d[1]\n\t" + "fmla v23.2d,v3.2d,v5.d[1]\n\t" + "fmla v24.2d,v2.2d,v6.d[0]\n\t" + "fmla v25.2d,v3.2d,v6.d[0]\n\t" + "fmla v26.2d,v2.2d,v6.d[1]\n\t" + "fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t" + "fmla v28.2d,v2.2d,v7.d[0]\n\t" + "fmla v29.2d,v3.2d,v7.d[0]\n\t" + "fmla v30.2d,v2.2d,v7.d[1]\n\t" + "fmla v31.2d,v3.2d,v7.d[1]\n\t" + "b 4f\n\t" + "3:\n\t" + /** tail part with k = 1 */ + "ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" + "fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" + "fmla v9.2d,v1.2d,v4.d[0]; add %[b2_],%[b2_],#32\n\t" + "fmla v10.2d,v0.2d,v4.d[1]\n\t" + "fmov v7.d[1],x0\n\t" + "fmla v11.2d,v1.2d,v4.d[1]; add %[sa],%[sa],#32\n\t" + "fmla v12.2d,v0.2d,v5.d[0]; add %[b1_],%[b1_],#32\n\t" + "fmla v13.2d,v1.2d,v5.d[0]; sub %[K],%[K],#1\n\t" + "ldr d4,[%[b3_]]\n\t" + "fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" + "fmla v15.2d,v1.2d,v5.d[1]\n\t" + "fmla v16.2d,v0.2d,v6.d[0]\n\t" + "ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" + "fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" + "fmla v18.2d,v0.2d,v6.d[1]; add %[b3_],%[b3_],#32\n\t" + "fmla v19.2d,v1.2d,v6.d[1]\n\t" + "fmov v5.d[1],x0\n\t" + "fmla v20.2d,v0.2d,v7.d[0]\n\t" + "fmla v21.2d,v1.2d,v7.d[0]\n\t" + "fmla v22.2d,v0.2d,v7.d[1]\n\t" + "fmla v23.2d,v1.2d,v7.d[1]\n\t" + "fmla v24.2d,v0.2d,v4.d[0]\n\t" + "fmla v25.2d,v1.2d,v4.d[0]\n\t" + "fmla v26.2d,v0.2d,v4.d[1]\n\t" + "fmla v27.2d,v1.2d,v4.d[1]\n\t" + "fmla v28.2d,v0.2d,v5.d[0]\n\t" + "fmla v29.2d,v1.2d,v5.d[0]\n\t" + "fmla v30.2d,v0.2d,v5.d[1]\n\t" + "fmla v31.2d,v1.2d,v5.d[1]\n\t" + /** store 4x12 elements to C */ + "4:\n\t" + "ldr d0,%[alpha]; add x0,%[C],%[LDC],LSL #3\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v8.2d,v0.d[0]; fmla v2.2d,v9.2d,v0.d[0]\n\t" + "fmla v3.2d,v10.2d,v0.d[0]; fmla v4.2d,v11.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v12.2d,v0.d[0]; fmla v2.2d,v13.2d,v0.d[0]\n\t" + "fmla v3.2d,v14.2d,v0.d[0]; fmla v4.2d,v15.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" 
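    /* v0.d[0] holds alpha; each ldp/fmla/stp group in this store-back section
       adds alpha times two accumulator columns into a 4x2 tile of C, with the
       two C pointers advanced by 2*LDC doubles between groups. */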
+ "fmla v1.2d,v16.2d,v0.d[0]; fmla v2.2d,v17.2d,v0.d[0]\n\t" + "fmla v3.2d,v18.2d,v0.d[0]; fmla v4.2d,v19.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v20.2d,v0.d[0]; fmla v2.2d,v21.2d,v0.d[0]\n\t" + "fmla v3.2d,v22.2d,v0.d[0]; fmla v4.2d,v23.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v24.2d,v0.d[0]; fmla v2.2d,v25.2d,v0.d[0]\n\t" + "fmla v3.2d,v26.2d,v0.d[0]; fmla v4.2d,v27.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v28.2d,v0.d[0]; fmla v2.2d,v29.2d,v0.d[0]\n\t" + "fmla v3.2d,v30.2d,v0.d[0]; fmla v4.2d,v31.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; stp q3,q4,[x0]\n\t" + :[sa]"+r"(sa), [b1_]"+r"(b1_), [b2_]"+r"(b2_), [b3_]"+r"(b3_), [C]"+r"(C), [K]"+r"(K) + :[LDC]"r"(LDC), [alpha]"m"(alpha) + :"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); +} + +/********************************************************** + * Operation: + C[0] += alpha * up[0]; C[1] += alpha * up[1]; + C[2] += alpha * down[0]; C[3] += alpha * down[1]; + *********************************************************/ +static inline void dgemm_store_m4n1(FLOAT *C, float64x2_t up, float64x2_t down, FLOAT alpha) { + float64x2_t t1 = vld1q_f64(C), t2 = vld1q_f64(C + 2); + t1 = vfmaq_n_f64(t1, up, alpha); + t2 = vfmaq_n_f64(t2, down, alpha); + vst1q_f64(C, t1); + vst1q_f64(C + 2, t2); +} + +/********************************************************** + * Function: dgemm_kernel_arm64_4x4_m4n8 + * Operation: C[4][8] += alpha * sa[4][K] * sb[K][8] + * Matrix orders: + * sa: column-major (leading dimension == 4) + * sb: 2 concatenated row-major 4-column submatrices + * C: column-major (leading dimension == LDC) + *********************************************************/ +static inline void dgemm_kernel_arm64_4x4_m4n8( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + K * 4; + + /** register naming: c + m_id + n_id, m_id=1~2, n_id=1~8 */ + float64x2_t c11, c12, c13, c14, c15, c16, c17, c18; + float64x2_t c21, c22, c23, c24, c25, c26, c27, c28; + c11 = c12 = c13 = c14 = c15 = c16 = c17 = c18 = vdupq_n_f64(0); + c21 = c22 = c23 = c24 = c25 = c26 = c27 = c28 = vdupq_n_f64(0); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa); + float64x2_t a2 = vld1q_f64(sa + 2); sa += 4; + + float64x2_t b1 = vld1q_f64(b1_); + c11 = vfmaq_laneq_f64(c11, a1, b1, 0); + c21 = vfmaq_laneq_f64(c21, a2, b1, 0); + c12 = vfmaq_laneq_f64(c12, a1, b1, 1); + c22 = vfmaq_laneq_f64(c22, a2, b1, 1); + + float64x2_t b2 = vld1q_f64(b1_ + 2); b1_ += 4; + c13 = vfmaq_laneq_f64(c13, a1, b2, 0); + c23 = vfmaq_laneq_f64(c23, a2, b2, 0); + c14 = vfmaq_laneq_f64(c14, a1, b2, 1); + c24 = vfmaq_laneq_f64(c24, a2, b2, 1); + + float64x2_t b3 = vld1q_f64(b2_); + c15 = vfmaq_laneq_f64(c15, a1, b3, 0); + c25 = vfmaq_laneq_f64(c25, a2, b3, 0); + c16 = vfmaq_laneq_f64(c16, a1, b3, 1); + c26 = vfmaq_laneq_f64(c26, a2, b3, 1); + + float64x2_t b4 = vld1q_f64(b2_ + 2); b2_ += 4; + c17 = vfmaq_laneq_f64(c17, a1, b4, 0); + c27 = 
vfmaq_laneq_f64(c27, a2, b4, 0); + c18 = vfmaq_laneq_f64(c18, a1, b4, 1); + c28 = vfmaq_laneq_f64(c28, a2, b4, 1); + } + + dgemm_store_m4n1(C, c11, c21, alpha); C += LDC; + dgemm_store_m4n1(C, c12, c22, alpha); C += LDC; + dgemm_store_m4n1(C, c13, c23, alpha); C += LDC; + dgemm_store_m4n1(C, c14, c24, alpha); C += LDC; + dgemm_store_m4n1(C, c15, c25, alpha); C += LDC; + dgemm_store_m4n1(C, c16, c26, alpha); C += LDC; + dgemm_store_m4n1(C, c17, c27, alpha); C += LDC; + dgemm_store_m4n1(C, c18, c28, alpha); +} + +/********************************************************** + * Function: dgemm_kernel_arm64_4x4_m4n4 + * Operation: C[4][4] += alpha * sa[4][K] * sb[K][4] + * Matrix orders: + * sa: column-major (leading dimension == 4) + * sb: row-major (leading dimension == 4) + * C: column-major (leading dimension == LDC) + *********************************************************/ +static inline void dgemm_kernel_arm64_4x4_m4n4( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c11, c21, c12, c22, c13, c23, c14, c24; + c11 = c21 = c12 = c22 = c13 = c23 = c14 = c24 = vdupq_n_f64(0); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa); + float64x2_t a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb); + float64x2_t b2 = vld1q_f64(sb + 2); sb += 4; + c11 = vfmaq_laneq_f64(c11, a1, b1, 0); + c21 = vfmaq_laneq_f64(c21, a2, b1, 0); + c12 = vfmaq_laneq_f64(c12, a1, b1, 1); + c22 = vfmaq_laneq_f64(c22, a2, b1, 1); + c13 = vfmaq_laneq_f64(c13, a1, b2, 0); + c23 = vfmaq_laneq_f64(c23, a2, b2, 0); + c14 = vfmaq_laneq_f64(c14, a1, b2, 1); + c24 = vfmaq_laneq_f64(c24, a2, b2, 1); + } + + dgemm_store_m4n1(C, c11, c21, alpha); C += LDC; + dgemm_store_m4n1(C, c12, c22, alpha); C += LDC; + dgemm_store_m4n1(C, c13, c23, alpha); C += LDC; + dgemm_store_m4n1(C, c14, c24, alpha); +} + +static inline void dgemm_kernel_arm64_4x4_m4n2( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c11_1, c11_2, c21_1, c21_2, c12_1, c12_2, c22_1, c22_2; + c11_1 = c11_2 = c21_1 = c21_2 = c12_1 = c12_2 = c22_1 = c22_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + float64x2_t a1_1 = vld1q_f64(sa), a2_1 = vld1q_f64(sa + 2), + a1_2 = vld1q_f64(sa + 4), a2_2 = vld1q_f64(sa + 6); sa += 8; + c11_1 = vfmaq_laneq_f64(c11_1, a1_1, b1, 0); + c21_1 = vfmaq_laneq_f64(c21_1, a2_1, b1, 0); + c12_1 = vfmaq_laneq_f64(c12_1, a1_1, b1, 1); + c22_1 = vfmaq_laneq_f64(c22_1, a2_1, b1, 1); + c11_2 = vfmaq_laneq_f64(c11_2, a1_2, b2, 0); + c21_2 = vfmaq_laneq_f64(c21_2, a2_2, b2, 0); + c12_2 = vfmaq_laneq_f64(c12_2, a1_2, b2, 1); + c22_2 = vfmaq_laneq_f64(c22_2, a2_2, b2, 1); + } + c11_1 = vaddq_f64(c11_1, c11_2); + c21_1 = vaddq_f64(c21_1, c21_2); + c12_1 = vaddq_f64(c12_1, c12_2); + c22_1 = vaddq_f64(c22_1, c22_2); + if (K) { + float64x2_t b1 = vld1q_f64(sb); sb += 2; + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + c11_1 = vfmaq_laneq_f64(c11_1, a1, b1, 0); + c21_1 = vfmaq_laneq_f64(c21_1, a2, b1, 0); + c12_1 = vfmaq_laneq_f64(c12_1, a1, b1, 1); + c22_1 = vfmaq_laneq_f64(c22_1, a2, b1, 1); + } + + dgemm_store_m4n1(C, c11_1, c21_1, alpha); C += LDC; + dgemm_store_m4n1(C, c12_1, c22_1, alpha); +} + +static inline void dgemm_kernel_arm64_4x4_m4n1( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c11_1, c11_2, c21_1, c21_2; + c11_1 = c11_2 = c21_1 = c21_2 = vdupq_n_f64(0); + + for (; K > 1; K 
-= 2) { + float64x2_t b1 = vld1q_f64(sb); sb += 2; + c11_1 = vfmaq_laneq_f64(c11_1, vld1q_f64(sa), b1, 0); + c21_1 = vfmaq_laneq_f64(c21_1, vld1q_f64(sa + 2), b1, 0); + c11_2 = vfmaq_laneq_f64(c11_2, vld1q_f64(sa + 4), b1, 1); + c21_2 = vfmaq_laneq_f64(c21_2, vld1q_f64(sa + 6), b1, 1); + sa += 8; + } + c11_1 = vaddq_f64(c11_1, c11_2); + c21_1 = vaddq_f64(c21_1, c21_2); + if (K) { + double b1 = *sb++; + c11_1 = vfmaq_n_f64(c11_1, vld1q_f64(sa), b1); + c21_1 = vfmaq_n_f64(c21_1, vld1q_f64(sa + 2), b1); + sa += 4; + } + + dgemm_store_m4n1(C, c11_1, c21_1, alpha); +} + +static inline void dgemm_kernel_arm64_4x4_m2n12( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c01, c02, c03, c04, c11, c12, c13, c14, c21, c22, c23, c24; + c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 = + c21 = c22 = c23 = c24 = vdupq_n_f64(0); + + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + 4 * K; + const FLOAT *b3_ = b2_ + 4 * K; + + for (; K; K--) { + const float64x2_t a1 = vld1q_f64(sa); sa += 2; + + float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4; + c01 = vfmaq_laneq_f64(c01, a1, b1, 0); + c02 = vfmaq_laneq_f64(c02, a1, b1, 1); + c03 = vfmaq_laneq_f64(c03, a1, b2, 0); + c04 = vfmaq_laneq_f64(c04, a1, b2, 1); + + b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4; + c11 = vfmaq_laneq_f64(c11, a1, b1, 0); + c12 = vfmaq_laneq_f64(c12, a1, b1, 1); + c13 = vfmaq_laneq_f64(c13, a1, b2, 0); + c14 = vfmaq_laneq_f64(c14, a1, b2, 1); + + b1 = vld1q_f64(b3_); b2 = vld1q_f64(b3_ + 2); b3_ += 4; + c21 = vfmaq_laneq_f64(c21, a1, b1, 0); + c22 = vfmaq_laneq_f64(c22, a1, b1, 1); + c23 = vfmaq_laneq_f64(c23, a1, b2, 0); + c24 = vfmaq_laneq_f64(c24, a1, b2, 1); + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c21, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c22, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c23, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c24, alpha)); +} + +static inline void dgemm_kernel_arm64_4x4_m2n8( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c01, c02, c03, c04, c11, c12, c13, c14; + c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 = vdupq_n_f64(0); + + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + 4 * K; + + for (; K; K--) { + const float64x2_t a1 = vld1q_f64(sa); sa += 2; + + float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4; + c01 = vfmaq_laneq_f64(c01, a1, b1, 0); + c02 = vfmaq_laneq_f64(c02, a1, b1, 1); + c03 = vfmaq_laneq_f64(c03, a1, b2, 0); + c04 = vfmaq_laneq_f64(c04, a1, b2, 1); + + b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4; + c11 = vfmaq_laneq_f64(c11, a1, b1, 0); + c12 = vfmaq_laneq_f64(c12, a1, b1, 1); + c13 = vfmaq_laneq_f64(c13, a1, b2, 0); + c14 = vfmaq_laneq_f64(c14, a1, b2, 1); + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, alpha)); c += 
LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha)); +} + +static inline void dgemm_kernel_arm64_4x4_m2n4( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1_1, c1_2, c2_1, c2_2, c3_1, c3_2, c4_1, c4_2; + c1_1 = c1_2 = c2_1 = c2_2 = c3_1 = c3_2 = c4_1 = c4_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1_1 = vld1q_f64(sb), b2_1 = vld1q_f64(sb + 2); + float64x2_t b1_2 = vld1q_f64(sb + 4), b2_2 = vld1q_f64(sb + 6); sb += 8; + + c1_1 = vfmaq_laneq_f64(c1_1, a1, b1_1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, a1, b1_1, 1); + c3_1 = vfmaq_laneq_f64(c3_1, a1, b2_1, 0); + c4_1 = vfmaq_laneq_f64(c4_1, a1, b2_1, 1); + + c1_2 = vfmaq_laneq_f64(c1_2, a2, b1_2, 0); + c2_2 = vfmaq_laneq_f64(c2_2, a2, b1_2, 1); + c3_2 = vfmaq_laneq_f64(c3_2, a2, b2_2, 0); + c4_2 = vfmaq_laneq_f64(c4_2, a2, b2_2, 1); + } + c1_1 = vaddq_f64(c1_1, c1_2); + c2_1 = vaddq_f64(c2_1, c2_2); + c3_1 = vaddq_f64(c3_1, c3_2); + c4_1 = vaddq_f64(c4_1, c4_2); + if (K) { + float64x2_t a1 = vld1q_f64(sa); sa += 2; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); + c3_1 = vfmaq_laneq_f64(c3_1, a1, b2, 0); + c4_1 = vfmaq_laneq_f64(c4_1, a1, b2, 1); + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c3_1, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c4_1, alpha)); +} + +static inline void dgemm_kernel_arm64_4x4_m2n2( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1_1, c1_2, c2_1, c2_2; + c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + + c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); + c1_2 = vfmaq_laneq_f64(c1_2, a2, b2, 0); + c2_2 = vfmaq_laneq_f64(c2_2, a2, b2, 1); + } + c1_1 = vaddq_f64(c1_1, c1_2); + c2_1 = vaddq_f64(c2_1, c2_2); + if (K) { + float64x2_t a1 = vld1q_f64(sa); sa += 2; + float64x2_t b1 = vld1q_f64(sb); sb += 2; + c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha)); +} + +static inline void dgemm_kernel_arm64_4x4_m2n1( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = vdupq_n_f64(0); + + for (; K > 3; K -= 4) { + float64x2_t b12 = vld1q_f64(sb), b34 = vld1q_f64(sb + 2); sb += 4; + c1 = vfmaq_laneq_f64(c1, vld1q_f64(sa), b12, 0); + c2 = vfmaq_laneq_f64(c2, vld1q_f64(sa + 2), b12, 1); + c3 = vfmaq_laneq_f64(c3, vld1q_f64(sa + 4), b34, 0); + c4 = vfmaq_laneq_f64(c4, vld1q_f64(sa + 6), b34, 1); + sa += 8; + } + c1 = vaddq_f64(c1, c2); + c3 = vaddq_f64(c3, c4); + c1 = vaddq_f64(c1, c3); + for (; K; K--) { + c1 = vfmaq_n_f64(c1, vld1q_f64(sa), *sb++); + sa += 2; + } + + vst1q_f64(c, 
vfmaq_n_f64(vld1q_f64(c), c1, alpha)); +} + +static inline void dgemm_store_m1n2(double *C, float64x2_t vc, + double alpha, BLASLONG LDC) { + double c0 = vgetq_lane_f64(vc, 0); + double c1 = vgetq_lane_f64(vc, 1); + C[0] += c0 * alpha; + C[LDC] += c1 * alpha; +} + +static inline void dgemm_kernel_arm64_4x4_m1n12( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4, c5, c6; + c1 = c2 = c3 = c4 = c5 = c6 = vdupq_n_f64(0); + + const double *b1_ = sb; + const double *b2_ = sb + 4 * K; + const double *b3_ = b2_ + 4 * K; + + for (; K; K--) { + const double a1 = *sa++; + c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1); + c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4; + c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1); + c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4; + c5 = vfmaq_n_f64(c5, vld1q_f64(b3_), a1); + c6 = vfmaq_n_f64(c6, vld1q_f64(b3_ + 2), a1); b3_ += 4; + } + + dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c3, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c4, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c5, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c6, alpha, LDC); +} + +static inline void dgemm_kernel_arm64_4x4_m1n8( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = vdupq_n_f64(0); + + const double *b1_ = sb; + const double *b2_ = sb + 4 * K; + + for (; K; K--) { + const double a1 = *sa++; + c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1); + c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4; + c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1); + c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4; + } + + dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c3, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c4, alpha, LDC); +} + +static inline void dgemm_kernel_arm64_4x4_m1n4( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1_1, c1_2, c2_1, c2_2; + c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa); sa += 2; + c1_1 = vfmaq_laneq_f64(c1_1, vld1q_f64(sb), a1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, vld1q_f64(sb + 2), a1, 0); + c1_2 = vfmaq_laneq_f64(c1_2, vld1q_f64(sb + 4), a1, 1); + c2_2 = vfmaq_laneq_f64(c2_2, vld1q_f64(sb + 6), a1, 1); sb += 8; + } + c1_1 = vaddq_f64(c1_1, c1_2); + c2_1 = vaddq_f64(c2_1, c2_2); + if (K) { + double a1 = *sa++; + c1_1 = vfmaq_n_f64(c1_1, vld1q_f64(sb), a1); + c2_1 = vfmaq_n_f64(c2_1, vld1q_f64(sb + 2), a1); + sb += 4; + } + + dgemm_store_m1n2(C, c1_1, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c2_1, alpha, LDC); +} + +static inline void dgemm_kernel_arm64_4x4_m1n2( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = vdupq_n_f64(0); + + for (; K > 3; K -= 4) { + float64x2_t a12 = vld1q_f64(sa), a34 = vld1q_f64(sa + 2); sa += 4; + c1 = vfmaq_laneq_f64(c1, vld1q_f64(sb), a12, 0); + c2 = vfmaq_laneq_f64(c2, vld1q_f64(sb + 2), a12, 1); + c3 = vfmaq_laneq_f64(c3, vld1q_f64(sb + 4), a34, 0); + c4 = vfmaq_laneq_f64(c4, vld1q_f64(sb + 6), a34, 1); sb += 8; + } + c1 = vaddq_f64(c1, c2); + c3 = vaddq_f64(c3, c4); + c1 = vaddq_f64(c1, c3); + for (; K; K--) { + c1 = vfmaq_n_f64(c1, vld1q_f64(sb), *sa++); + sb += 2; + } + + dgemm_store_m1n2(C, 
c1, alpha, LDC); +} + +static inline void dgemm_kernel_arm64_4x4_m1n1( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = vdupq_n_f64(0); + + for (; K > 7; K -= 8) { + c1 = vfmaq_f64(c1, vld1q_f64(sb), vld1q_f64(sa)); + c2 = vfmaq_f64(c2, vld1q_f64(sb + 2), vld1q_f64(sa + 2)); + c3 = vfmaq_f64(c3, vld1q_f64(sb + 4), vld1q_f64(sa + 4)); + c4 = vfmaq_f64(c4, vld1q_f64(sb + 6), vld1q_f64(sa + 6)); + sa += 8; sb += 8; + } + c1 = vaddq_f64(c1, c2); + c3 = vaddq_f64(c3, c4); + c1 = vaddq_f64(c1, c3); + double cs1 = vpaddd_f64(c1); + for (; K; K--) { + cs1 += (*sa++) * (*sb++); + } + + C[0] += cs1 * alpha; +} + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, + FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { + + for (; N >= 12; N -= 12) { + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + dgemm_kernel_arm_cortex_a53_4x4_m4n12(a_, sb, c_, K, LDC, alpha); + c_ += 4; + a_ += 4 * K; + } + if (m_left >= 2) { + m_left -= 2; + dgemm_kernel_arm64_4x4_m2n12(a_, sb, c_, K, LDC, alpha); + c_ += 2; + a_ += 2 * K; + } + if (m_left) { + dgemm_kernel_arm64_4x4_m1n12(a_, sb, c_, K, LDC, alpha); + } + sb += 12 * K; + C += 12 * LDC; + } + + if (N >= 8) { + N -= 8; + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + dgemm_kernel_arm64_4x4_m4n8(a_, sb, c_, K, LDC, alpha); + c_ += 4; + a_ += 4 * K; + } + if (m_left >= 2) { + m_left -= 2; + dgemm_kernel_arm64_4x4_m2n8(a_, sb, c_, K, LDC, alpha); + c_ += 2; + a_ += 2 * K; + } + if (m_left) { + dgemm_kernel_arm64_4x4_m1n8(a_, sb, c_, K, LDC, alpha); + } + sb += 8 * K; + C += 8 * LDC; + } else if (N >= 4) { + N -= 4; + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + dgemm_kernel_arm64_4x4_m4n4(a_, sb, c_, K, LDC, alpha); + c_ += 4; + a_ += 4 * K; + } + if (m_left >= 2) { + m_left -= 2; + dgemm_kernel_arm64_4x4_m2n4(a_, sb, c_, K, LDC, alpha); + c_ += 2; + a_ += 2 * K; + } + if (m_left) { + dgemm_kernel_arm64_4x4_m1n4(a_, sb, c_, K, LDC, alpha); + } + sb += 4 * K; + C += 4 * LDC; + } + + if (N >= 2) { + N -= 2; + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + dgemm_kernel_arm64_4x4_m4n2(a_, sb, c_, K, LDC, alpha); + c_ += 4; + a_ += 4 * K; + } + if (m_left >= 2) { + m_left -= 2; + dgemm_kernel_arm64_4x4_m2n2(a_, sb, c_, K, LDC, alpha); + c_ += 2; + a_ += 2 * K; + } + if (m_left) { + dgemm_kernel_arm64_4x4_m1n2(a_, sb, c_, K, LDC, alpha); + } + sb += 2 * K; + C += 2 * LDC; + } + + if (N) { + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + dgemm_kernel_arm64_4x4_m4n1(a_, sb, c_, K, LDC, alpha); + c_ += 4; + a_ += 4 * K; + } + if (m_left >= 2) { + m_left -= 2; + dgemm_kernel_arm64_4x4_m2n1(a_, sb, c_, K, LDC, alpha); + c_ += 2; + a_ += 2 * K; + } + if (m_left) { + dgemm_kernel_arm64_4x4_m1n1(a_, sb, c_, K, LDC, alpha); + } + } + return 0; +} + diff --git a/kernel/arm64/dgemm_kernel_sve_v1x8.S b/kernel/arm64/dgemm_kernel_sve_v1x8.S new file mode 100644 index 000000000..bbbd0fd95 --- /dev/null +++ b/kernel/arm64/dgemm_kernel_sve_v1x8.S @@ -0,0 +1,874 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA x16 +#define alpha x17 + +#define alpha0 d10 +#define alphaZ z2.d + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x8_I + ld1d z0.d, p1/z, [pA] + ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 
8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M1 + ld1d z1.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M2 + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + fmla z16.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_E + fmla z16.d, p1/m, z1.d, z8.d + fmla z17.d, p1/m, z1.d, z9.d + fmla z18.d, p1/m, z1.d, z10.d + fmla z19.d, p1/m, z1.d, z11.d + fmla z20.d, p1/m, z1.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + fmla z22.d, p1/m, z1.d, z14.d + fmla z23.d, p1/m, z1.d, z15.d +.endm + +.macro KERNELv1x8_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z19.d, p1/m, z0.d, z11.d + fmla z20.d, p1/m, z0.d, z12.d + fmla z21.d, p1/m, z0.d, z13.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.d, p1/m, z0.d, z14.d + fmla z23.d, p1/m, z0.d, z15.d + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, 
[pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z28.d, p1/z, [pCRow2] + fmla z28.d, p1/m, z20.d, alphaZ + st1d z28.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z29.d, p1/z, [pCRow1] + fmla z29.d, p1/m, z21.d, alphaZ + st1d z29.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z30.d, p1/z, [pCRow2] + fmla z30.d, p1/m, z22.d, alphaZ + st1d z30.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z31.d, p1/z, [pCRow1] + fmla z31.d, p1/m, z23.d, alphaZ + st1d z31.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x4_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z17.d, p1/m, z0.d, z9.d + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.d, #0 +.endm + +.macro KERNELv1x1_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, 
[pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 + dup alphaZ, alpha + + lsl LDC, LDC, #3 // ldc = ldc * 8 + ptrue p0.d // create true predicate + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Ldgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L8_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Ldgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv1_22a + + .align 5 +.Ldgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv1_22 + + .align 5 +.Ldgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldgemm_kernel_L8_Mv1_44 + + .align 5 +.Ldgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldgemm_kernel_L8_Mv1_44 + +.Ldgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv1_100 + + .align 5 +.Ldgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv1_46 + +.Ldgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Ldgemm_kernel_L8_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Ldgemm_kernel_L8_Mv1_20 + +.Ldgemm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt .Ldgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in 
N */ + + .align 5 +.Ldgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L4_Mv1_44 + + .align 5 +.Ldgemm_kernel_L4_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv1_22 + +.Ldgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv1_100 + + .align 5 +.Ldgemm_kernel_L4_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv1_46 + +.Ldgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Ldgemm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L4_Mv1_20 + + +.Ldgemm_kernel_L4_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Ldgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
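+	// K is consumed in unrolled blocks of eight KERNELv1x2_SUB steps below;
+	// the K%8 remainder is handled one step at a time in .Ldgemm_kernel_L2_Mv1_46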
+ ble .Ldgemm_kernel_L2_Mv1_44 + + .align 5 +.Ldgemm_kernel_L2_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv1_22 + +.Ldgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv1_100 + + .align 5 +.Ldgemm_kernel_L2_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv1_46 + +.Ldgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Ldgemm_kernel_L2_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L2_Mv1_20 + + +.Ldgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Ldgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? + ble .Ldgemm_kernel_L1_Mv1_44 + + .align 5 +.Ldgemm_kernel_L1_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_22 + +.Ldgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv1_100 + + .align 5 +.Ldgemm_kernel_L1_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_46 + +.Ldgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Ldgemm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L1_Mv1_20 + + +.Ldgemm_kernel_L1_END: + +/******************************************************************************/ + +.Ldgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dgemm_kernel_sve_v2x8.S b/kernel/arm64/dgemm_kernel_sve_v2x8.S new file mode 100644 index 000000000..023d5ba92 --- /dev/null +++ b/kernel/arm64/dgemm_kernel_sve_v2x8.S @@ -0,0 +1,1683 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/* This is an SVE dgemm kernel with size 2*SVE_LEN x 8. +However, the data layout is the same as for the kernel 1*SVE_LEN x 8. +This means that we sweep two panels of packed A when iterating in a loop over K. +With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA1 x16 +#define pA2 x17 +#define alpha x18 +#define vec_len x19 +#define vec_lenx2 x20 + +#define alpha0 d10 +#define alphaZ z7.d + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA1 +// 17 pA1 +// 18 must save alpha +// 19 must save vec_len +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA10_0 +//v01 pA10_1 +//v02 pA20_0 +//v03 pA20_1 +//v04 +//v05 +//v06 +//v07 ALPHA0 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 +//v24 must save C8 +//v25 must save C9 +//v26 must save C10 +//v27 must save C11 +//v28 must save C12 +//v29 must save C13 +//v30 must save C14 +//v31 must save C15 + +/******************************************************************************* +* 
Macro definitions +*******************************************************************************/ + +.macro INITv2x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 + dup z24.d, #0 + dup z25.d, #0 + dup z26.d, #0 + dup z27.d, #0 + dup z28.d, #0 + dup z29.d, #0 + dup z30.d, #0 + dup z31.d, #0 +.endm + +.macro KERNELv2x8_I + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + ld1d z2.d, p0/z, [pA1, vec_len, lsl #3] + ld1d z3.d, p0/z, [pA2, vec_len, lsl #3] + add pA1, pA1, vec_len, lsl #4 // pA1 = pA1 + vec_len * 8 *2 + add pA2, pA2, vec_len, lsl #4 // pA1 = pA1 + vec_len * 8 *2 + + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z0.d, z10.d + fmla z21.d, p0/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z0.d, z12.d + fmla z25.d, p0/m, z1.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z0.d, z14.d + fmla z29.d, p0/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z0.d, z15.d + fmla z31.d, p0/m, z1.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + + add pB, pB, 64 +.endm + +.macro KERNELv2x8_M1 + ld1d z2.d, p0/z, [pA1] + ld1d z3.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z0.d, z10.d + fmla z21.d, p0/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z0.d, z12.d + fmla z25.d, p0/m, z1.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z0.d, z14.d + fmla z29.d, p0/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z0.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + fmla z31.d, p0/m, z1.d, z15.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv2x8_M2 + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 2 * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 2 * 8 + + fmla z16.d, p0/m, z2.d, z8.d + fmla z17.d, p0/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z2.d, z9.d + fmla z19.d, p0/m, z3.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z2.d, z10.d + fmla z21.d, p0/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z2.d, z11.d + fmla z23.d, p0/m, z3.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z2.d, 
z12.d + fmla z25.d, p0/m, z3.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z2.d, z13.d + fmla z27.d, p0/m, z3.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z2.d, z14.d + fmla z29.d, p0/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z2.d, z15.d + fmla z31.d, p0/m, z3.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv2x8_E + fmla z16.d, p0/m, z2.d, z8.d + fmla z17.d, p0/m, z3.d, z8.d + fmla z18.d, p0/m, z2.d, z9.d + fmla z19.d, p0/m, z3.d, z9.d + fmla z20.d, p0/m, z2.d, z10.d + fmla z21.d, p0/m, z3.d, z10.d + fmla z22.d, p0/m, z2.d, z11.d + fmla z23.d, p0/m, z3.d, z11.d + fmla z24.d, p0/m, z2.d, z12.d + fmla z25.d, p0/m, z3.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z26.d, p0/m, z2.d, z13.d + fmla z27.d, p0/m, z3.d, z13.d + fmla z28.d, p0/m, z2.d, z14.d + fmla z29.d, p0/m, z3.d, z14.d + fmla z30.d, p0/m, z2.d, z15.d + fmla z31.d, p0/m, z3.d, z15.d +.endm + +.macro KERNELv2x8_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + fmla z20.d, p0/m, z0.d, z10.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z21.d, p0/m, z1.d, z10.d + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + fmla z24.d, p0/m, z0.d, z12.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z25.d, p0/m, z1.d, z12.d + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + fmla z28.d, p0/m, z0.d, z14.d + fmla z29.d, p0/m, z1.d, z14.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z30.d, p0/m, z0.d, z15.d + fmla z31.d, p0/m, z1.d, z15.d +.endm + +.macro SAVEv2x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z20.d, alphaZ + fmla z13.d, p0/m, z21.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z22.d, alphaZ + fmla z15.d, p0/m, z23.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z8.d, p0/z, [pCRow2] + ld1d z9.d, p0/z, [pCRow2, #1, mul vl] + fmla z8.d, p0/m, z24.d, alphaZ + fmla z9.d, p0/m, z25.d, alphaZ + st1d z8.d, p0, [pCRow2] + st1d z9.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, 
#1, mul vl] + fmla z10.d, p0/m, z26.d, alphaZ + fmla z11.d, p0/m, z27.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z28.d, alphaZ + fmla z13.d, p0/m, z29.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z30.d, alphaZ + fmla z15.d, p0/m, z31.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv2x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv2x4_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + fmla z18.d, p0/m, z0.d, z9.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.d, p0/m, z1.d, z9.d + fmla z20.d, p0/m, z0.d, z10.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z21.d, p0/m, z1.d, z10.d + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d +.endm + +.macro SAVEv2x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z20.d, alphaZ + fmla z13.d, p0/m, z21.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z22.d, alphaZ + fmla z15.d, p0/m, z23.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv2x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv2x2_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] +.endm + +.macro SAVEv2x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, 
#1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 +.endm + +.macro INITv2x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv2x1_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] +.endm + +.macro SAVEv2x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv1x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x8_I + ld1d z0.d, p1/z, [pA1] + ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // next one + add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M1 + ld1d z1.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M2 + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + fmla z16.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z1.d, z14.d + ld1rd z14.d, 
p0/z, [pB, 48] + fmla z23.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_E + fmla z16.d, p1/m, z1.d, z8.d + fmla z17.d, p1/m, z1.d, z9.d + fmla z18.d, p1/m, z1.d, z10.d + fmla z19.d, p1/m, z1.d, z11.d + fmla z20.d, p1/m, z1.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + fmla z22.d, p1/m, z1.d, z14.d + fmla z23.d, p1/m, z1.d, z15.d +.endm + +.macro KERNELv1x8_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.d, p1/m, z0.d, z11.d + fmla z20.d, p1/m, z0.d, z12.d + fmla z21.d, p1/m, z0.d, z13.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.d, p1/m, z0.d, z14.d + fmla z23.d, p1/m, z0.d, z15.d + + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z28.d, p1/z, [pCRow2] + fmla z28.d, p1/m, z20.d, alphaZ + st1d z28.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z29.d, p1/z, [pCRow1] + fmla z29.d, p1/m, z21.d, alphaZ + st1d z29.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z30.d, p1/z, [pCRow2] + fmla z30.d, p1/m, z22.d, alphaZ + st1d z30.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z31.d, p1/z, [pCRow1] + fmla z31.d, p1/m, z23.d, alphaZ + st1d z31.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x4_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, 
[pCRow1, #C_PRE_SIZE] + + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z17.d, p1/m, z0.d, z9.d + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.d, #0 +.endm + +.macro KERNELv1x1_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 + dup alphaZ, alpha + cntd vec_len + lsl vec_lenx2, vec_len, #1 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + ptrue p0.d // create true predicate + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Ldgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L8_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN + blt .Ldgemm_kernel_L8_Mv1_BEGIN + + mov counterI, origM + +/* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */ + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + prfm PLDL1KEEP, [pA2] + + .align 5 +.Ldgemm_kernel_L8_Mv2_20: + + mov pB, origPB + INITv2x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Ldgemm_kernel_L8_Mv2_32 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv2_22a + + .align 5 +.Ldgemm_kernel_L8_Mv2_22: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv2_22 + + .align 5 +.Ldgemm_kernel_L8_Mv2_22a: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + b .Ldgemm_kernel_L8_Mv2_44 + + .align 5 +.Ldgemm_kernel_L8_Mv2_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv2_40 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + + b .Ldgemm_kernel_L8_Mv2_44 + +.Ldgemm_kernel_L8_Mv2_40: + + INITv2x8 + +.Ldgemm_kernel_L8_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv2_100 + + .align 5 +.Ldgemm_kernel_L8_Mv2_46: + + KERNELv2x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv2_46 + +.Ldgemm_kernel_L8_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x8 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L8_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L8_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L8_END + +////////////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x8 kernel. +.Ldgemm_kernel_L8_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Ldgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv1_22a + + .align 5 +.Ldgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv1_22 + + .align 5 +.Ldgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldgemm_kernel_L8_Mv1_44 + + .align 5 +.Ldgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldgemm_kernel_L8_Mv1_44 + +.Ldgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv1_100 + + .align 5 +.Ldgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv1_46 + +.Ldgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Ldgemm_kernel_L8_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Ldgemm_kernel_L8_Mv1_20 + +.Ldgemm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt .Ldgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Ldgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L4_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L4_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + + .align 5 +.Ldgemm_kernel_L4_Mv2_20: + + mov pB, origPB + INITv2x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Ldgemm_kernel_L4_Mv2_44 + + .align 5 +.Ldgemm_kernel_L4_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv2_22 + +.Ldgemm_kernel_L4_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv2_100 + + .align 5 +.Ldgemm_kernel_L4_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv2_46 + +.Ldgemm_kernel_L4_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x4 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L4_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L4_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L4_END + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x4 kernel. +.Ldgemm_kernel_L4_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L4_Mv1_44 + + .align 5 +.Ldgemm_kernel_L4_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv1_22 + +.Ldgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv1_100 + + .align 5 +.Ldgemm_kernel_L4_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv1_46 + +.Ldgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Ldgemm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L4_Mv1_20 + + +.Ldgemm_kernel_L4_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Ldgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L2_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L2_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + + .align 5 +.Ldgemm_kernel_L2_Mv2_20: + + mov pB, origPB + INITv2x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Ldgemm_kernel_L2_Mv2_44 + + .align 5 +.Ldgemm_kernel_L2_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv2_22 + +.Ldgemm_kernel_L2_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv2_100 + + .align 5 +.Ldgemm_kernel_L2_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv2_46 + +.Ldgemm_kernel_L2_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x2 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L2_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L2_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L2_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x2 kernel. +.Ldgemm_kernel_L2_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L2_Mv1_44 + + .align 5 +.Ldgemm_kernel_L2_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv1_22 + +.Ldgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv1_100 + + .align 5 +.Ldgemm_kernel_L2_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv1_46 + +.Ldgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Ldgemm_kernel_L2_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L2_Mv1_20 + + +.Ldgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Ldgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L1_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L1_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + + + .align 5 +.Ldgemm_kernel_L1_Mv2_20: + + mov pB, origPB + INITv2x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
+ ble .Ldgemm_kernel_L1_Mv2_44 + + .align 5 +.Ldgemm_kernel_L1_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv2_22 + +.Ldgemm_kernel_L1_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv2_100 + + .align 5 +.Ldgemm_kernel_L1_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv2_46 + +.Ldgemm_kernel_L1_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x1 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L1_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L1_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L1_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x1 kernel. +.Ldgemm_kernel_L1_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? + ble .Ldgemm_kernel_L1_Mv1_44 + + .align 5 +.Ldgemm_kernel_L1_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_22 + +.Ldgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv1_100 + + .align 5 +.Ldgemm_kernel_L1_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_46 + +.Ldgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Ldgemm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L1_Mv1_20 + + +.Ldgemm_kernel_L1_END: + +/******************************************************************************/ + +.Ldgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dgemm_ncopy_sve_v1.c b/kernel/arm64/dgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..1f812c775 --- /dev/null +++ b/kernel/arm64/dgemm_ncopy_sve_v1.c @@ -0,0 +1,79 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + svint64_t lda_vec = svindex_s64(0LL, lda); + uint64_t sve_size = svcntd(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec); + svst1_f64(pg, (double *) boffset, a_vec); + aoffset1++; + boffset += active; + } + aoffset += sve_size * lda; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/dgemm_tcopy_8.S b/kernel/arm64/dgemm_tcopy_8.S index 9ab51ff57..7e5bf6080 100644 --- a/kernel/arm64/dgemm_tcopy_8.S +++ b/kernel/arm64/dgemm_tcopy_8.S @@ -50,11 +50,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define B03 x16 #define B04 x17 -#define I x18 -#define J x19 +#define I x19 +#define J x20 -#define TEMP1 x20 -#define TEMP2 x21 +#define TEMP1 x21 #define A_PREFETCH 2560 #define B_PREFETCH 256 diff --git a/kernel/arm64/dgemm_tcopy_sve_v1.c b/kernel/arm64/dgemm_tcopy_sve_v1.c new file mode 100644 index 000000000..cb645a1b6 --- /dev/null +++ b/kernel/arm64/dgemm_tcopy_sve_v1.c @@ -0,0 +1,77 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + uint64_t sve_size = svcntd(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec = svld1(pg, (double *)aoffset1); + svst1_f64(pg, (double *) boffset, a_vec); + aoffset1 += lda; + boffset += active; + } + aoffset += sve_size; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S index 0ac5a5f24..3d953266c 100644 --- a/kernel/arm64/dtrmm_kernel_8x4.S +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alpha x17 -#define temp x18 +//#define temp x18 #define tempOffset x19 #define tempK x20 +#define temp x21 #define alpha0 d10 #define alphaV0 v10.d[0] diff --git a/kernel/arm64/dtrmm_kernel_sve_v1x8.S b/kernel/arm64/dtrmm_kernel_sve_v1x8.S new file mode 100644 index 000000000..1f8c9b20f --- /dev/null +++ b/kernel/arm64/dtrmm_kernel_sve_v1x8.S @@ -0,0 +1,1008 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA x16 +#define alpha x17 +//#define temp x18 +#define tempOffset x19 +#define tempK x20 +#define temp x21 + +#define alpha0 d10 +#define alphaZ z2.d + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x8_I + ld1d z0.d, p1/z, [pA] + ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + 
ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M1 + ld1d z1.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M2 + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + fmla z16.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_E + fmla z16.d, p1/m, z1.d, z8.d + fmla z17.d, p1/m, z1.d, z9.d + fmla z18.d, p1/m, z1.d, z10.d + fmla z19.d, p1/m, z1.d, z11.d + fmla z20.d, p1/m, z1.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + fmla z22.d, p1/m, z1.d, z14.d + fmla z23.d, p1/m, z1.d, z15.d +.endm + +.macro KERNELv1x8_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z19.d, p1/m, z0.d, z11.d + fmla z20.d, p1/m, z0.d, z12.d + fmla z21.d, p1/m, z0.d, z13.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.d, p1/m, z0.d, z14.d + fmla z23.d, p1/m, z0.d, z15.d + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z17.d, p1/m, z17.d, alphaZ + st1d z17.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z18.d, p1/m, z18.d, alphaZ + st1d z18.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z19.d, p1/m, z19.d, alphaZ + st1d z19.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z20.d, p1/m, z20.d, alphaZ + st1d z20.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z21.d, p1/m, z21.d, 
alphaZ + st1d z21.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z22.d, p1/m, z22.d, alphaZ + st1d z22.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z23.d, p1/m, z23.d, alphaZ + st1d z23.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x4_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z17.d, p1/m, z17.d, alphaZ + st1d z17.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z18.d, p1/m, z18.d, alphaZ + st1d z18.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z19.d, p1/m, z19.d, alphaZ + st1d z19.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z17.d, p1/m, z0.d, z9.d + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z17.d, p1/m, z17.d, alphaZ + st1d z17.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.d, #0 +.endm + +.macro KERNELv1x1_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + fmul z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 + dup alphaZ, alpha + + lsl LDC, LDC, #3 // ldc = ldc * 8 + ptrue p0.d // create true 
predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldtrmm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Ldtrmm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L8_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldtrmm_kernel_L8_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #8 +#endif + + INITv1x8 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Ldtrmm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldtrmm_kernel_L8_Mv1_22a + + .align 5 +.Ldtrmm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L8_Mv1_22 + + .align 5 +.Ldtrmm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldtrmm_kernel_L8_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldtrmm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldtrmm_kernel_L8_Mv1_44 + +.Ldtrmm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldtrmm_kernel_L8_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L8_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldtrmm_kernel_L8_Mv1_46 + +.Ldtrmm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #8 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Ldtrmm_kernel_L8_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L8_Mv1_20 + +.Ldtrmm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + +#if !defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Ldtrmm_kernel_L8_BEGIN + 
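+/* Illustrative sketch (C-like pseudocode, not literal code from this file):
+   the .Ldtrmm_kernel_L8_Mv1 loop above walks M one SVE vector at a time under
+   predicate p1, so the last M % SVE_LEN rows reuse the same predicated path
+   instead of a scalar tail loop.  For each slice the v1x8 micro-kernel does,
+   with tempK and the packed-panel offsets set at .Ldtrmm_kernel_L8_Mv1_20:
+
+       // acc[j] = SVE accumulator for column j, zeroed by INITv1x8
+       for (k = 0; k < tempK; k++)
+           for (j = 0; j < 8; j++)
+               acc[j][0..lanes-1] += A_panel[k][0..lanes-1] * B_panel[k*8 + j];
+
+       // SAVEv1x8 then stores alpha * acc[j] into column j of C (no C
+       // accumulate here, unlike the GEMM kernel's fmla with the old C)
+*/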
+/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Ldtrmm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldtrmm_kernel_L2_BEGIN + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldtrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldtrmm_kernel_L4_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L4_Mv1_22: + + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L4_Mv1_22 + +.Ldtrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L4_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L4_Mv1_46: + + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldtrmm_kernel_L4_Mv1_46 + +.Ldtrmm_kernel_L4_Mv1_100: + + SAVEv1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Ldtrmm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L4_Mv1_20 + + +.Ldtrmm_kernel_L4_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Ldtrmm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldtrmm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldtrmm_kernel_L2_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + INITv1x2 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Ldtrmm_kernel_L2_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L2_Mv1_22: + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L2_Mv1_22 + +.Ldtrmm_kernel_L2_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L2_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L2_Mv1_46: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldtrmm_kernel_L2_Mv1_46 + +.Ldtrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + +.Ldtrmm_kernel_L2_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L2_Mv1_20 + + +.Ldtrmm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Ldtrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldtrmm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldtrmm_kernel_L1_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + INITv1x1 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
+ ble .Ldtrmm_kernel_L1_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L1_Mv1_22: + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L1_Mv1_22 + +.Ldtrmm_kernel_L1_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L1_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L1_Mv1_46: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L1_Mv1_46 + +.Ldtrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + + +.Ldtrmm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L1_Mv1_20 + + +.Ldtrmm_kernel_L1_END: + +/******************************************************************************/ + +.Ldtrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index b94f0cffc..fba2fe8ce 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -58,6 +58,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n #define CUR_MAXINV "d8" #define CUR_MAXINV_V "v8.2d" #define CUR_MAX_V "v8.2d" +#define REGINF "d9" static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, double *ssq, double *scale) @@ -79,8 +80,10 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ble 9f //nrm2_kernel_L999 \n" "1: //nrm2_kernel_F_BEGIN: \n" + " mov x6, #0x7FF0000000000000 //+Infinity \n" " fmov "REGZERO", xzr \n" " fmov "REGONE", #1.0 \n" + " fmov "REGINF", x6 \n" " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" " mov "J", "N" \n" " cmp "J", xzr \n" @@ -104,6 +107,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d4, ["X"] \n" " fabs d4, d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -116,6 +121,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d3, ["X", #8] \n" " fabs d3, d3 \n" " fmax "CUR_MAX", "SCALE", d3 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -158,6 +165,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " fmaxp v24.2d, v24.2d, v26.2d \n" " fmaxp v24.2d, v24.2d, v24.2d \n" " fmax "CUR_MAX", "SCALE", d24 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" " //dup "CUR_MAX_V", v7.d[0] \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" @@ -217,6 +226,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " fmaxp v24.2d, 
v24.2d, v26.2d \n" " fmaxp v24.2d, v24.2d, v24.2d \n" " fmax "CUR_MAX", "SCALE", d24 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" " //dup "CUR_MAX_V", v7.d[0] \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" @@ -265,6 +276,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d4, ["X"] \n" " fabs d4, d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -276,6 +289,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d3, ["X", #8] \n" " fabs d3, d3 \n" " fmax "CUR_MAX", "SCALE", d3 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -291,6 +306,11 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, "9: //nrm2_kernel_L999: \n" " str "SSQ", [%[SSQ_]] \n" " str "SCALE", [%[SCALE_]] \n" + " b 11f \n" + "10: \n" + " str "REGINF", [%[SSQ_]] \n" + " str "REGINF", [%[SCALE_]] \n" + "11: \n" : : [SSQ_] "r" (ssq), //%0 @@ -300,8 +320,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, [INCX_] "r" (inc_x) //%4 : "cc", "memory", - "x0", "x1", "x2", "x3", "x4", "x5", - "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8" + "x0", "x1", "x2", "x3", "x4", "x5", "x6", + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", REGINF ); } @@ -359,6 +379,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) cur_ssq = *ptr; cur_scale = *(ptr + 1); + if (cur_ssq == INFINITY) { + ssq = INFINITY; + scale = INFINITY; + break; + } + if (cur_scale != 0) { if (cur_scale > scale) { scale = (scale / cur_scale); diff --git a/kernel/arm64/sgemm_kernel_sve_v1x8.S b/kernel/arm64/sgemm_kernel_sve_v1x8.S new file mode 100644 index 000000000..88c74bc0f --- /dev/null +++ b/kernel/arm64/sgemm_kernel_sve_v1x8.S @@ -0,0 +1,874 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA x16 +#define alpha w17 + +#define alpha0 s10 +#define alphaZ z2.s + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x8_I + ld1w z0.s, p1/z, [pA] + ld1w z1.s, p1/z, [pA, lanes, lsl #2] // next one + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M1 + ld1w z1.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA 
= pA + lanes * 4 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M2 + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + fmla z16.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_E + fmla z16.s, p1/m, z1.s, z8.s + fmla z17.s, p1/m, z1.s, z9.s + fmla z18.s, p1/m, z1.s, z10.s + fmla z19.s, p1/m, z1.s, z11.s + fmla z20.s, p1/m, z1.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + fmla z22.s, p1/m, z1.s, z14.s + fmla z23.s, p1/m, z1.s, z15.s +.endm + +.macro KERNELv1x8_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + fmla z18.s, p1/m, z0.s, z10.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z19.s, p1/m, z0.s, z11.s + fmla z20.s, p1/m, z0.s, z12.s + fmla z21.s, p1/m, z0.s, z13.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.s, p1/m, z0.s, z14.s + fmla z23.s, p1/m, z0.s, z15.s + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z28.s, p1/z, [pCRow2] + fmla z28.s, p1/m, z20.s, alphaZ + st1w z28.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z29.s, p1/z, [pCRow1] + fmla z29.s, p1/m, z21.s, alphaZ + st1w z29.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z30.s, p1/z, [pCRow2] + fmla z30.s, p1/m, z22.s, alphaZ + st1w z30.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z31.s, p1/z, [pCRow1] + fmla z31.s, p1/m, z23.s, alphaZ + st1w z31.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, 
lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x4_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z18.s, p1/m, z0.s, z10.s + fmla z19.s, p1/m, z0.s, z11.s + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z17.s, p1/m, z0.s, z9.s + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.s, #0 +.endm + +.macro KERNELv1x1_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 8 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 + dup alphaZ, alpha + + lsl LDC, LDC, #2 // ldc = ldc * 4 + ptrue p0.s // create true predicate + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J 
/ 8 + cmp counterJ, #0 + ble .Ldgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Ldgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L8_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Ldgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv1_22a + + .align 5 +.Ldgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv1_22 + + .align 5 +.Ldgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldgemm_kernel_L8_Mv1_44 + + .align 5 +.Ldgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldgemm_kernel_L8_Mv1_44 + +.Ldgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv1_100 + + .align 5 +.Ldgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv1_46 + +.Ldgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Ldgemm_kernel_L8_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Ldgemm_kernel_L8_Mv1_20 + +.Ldgemm_kernel_L8_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 8 * 4 + + subs counterJ, counterJ , #1 // j-- + bgt .Ldgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Ldgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Ldgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
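+// counterL = K / 8: the unrolled loop at .Ldgemm_kernel_L4_Mv1_22 issues eight KERNELv1x4_SUB
+// per pass; the K % 8 remainder is drained one step at a time at .Ldgemm_kernel_L4_Mv1_46.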
+ ble .Ldgemm_kernel_L4_Mv1_44 + + .align 5 +.Ldgemm_kernel_L4_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv1_22 + +.Ldgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv1_100 + + .align 5 +.Ldgemm_kernel_L4_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv1_46 + +.Ldgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Ldgemm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Ldgemm_kernel_L4_Mv1_20 + + +.Ldgemm_kernel_L4_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 4 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Ldgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Ldgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L2_Mv1_44 + + .align 5 +.Ldgemm_kernel_L2_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv1_22 + +.Ldgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv1_100 + + .align 5 +.Ldgemm_kernel_L2_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv1_46 + +.Ldgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Ldgemm_kernel_L2_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Ldgemm_kernel_L2_Mv1_20 + + +.Ldgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Ldgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Ldgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
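+// N remainder of 1: the loop at .Ldgemm_kernel_L1_Mv1_22 runs eight KERNELv1x1_SUB per pass;
+// the K % 8 tail is handled at .Ldgemm_kernel_L1_Mv1_46.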
+ ble .Ldgemm_kernel_L1_Mv1_44 + + .align 5 +.Ldgemm_kernel_L1_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_22 + +.Ldgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv1_100 + + .align 5 +.Ldgemm_kernel_L1_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_46 + +.Ldgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Ldgemm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Ldgemm_kernel_L1_Mv1_20 + + +.Ldgemm_kernel_L1_END: + +/******************************************************************************/ + +.Ldgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_kernel_sve_v2x8.S b/kernel/arm64/sgemm_kernel_sve_v2x8.S new file mode 100644 index 000000000..1cdd8253e --- /dev/null +++ b/kernel/arm64/sgemm_kernel_sve_v2x8.S @@ -0,0 +1,1683 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/* This is an SVE sgemm kernel with size 2*SVE_LEN x 8. +However, the data layout is the same as for the kernel 1*SVE_LEN x 8. +This means that we sweep two panels of packed A when iterating in a loop over K. 
+With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA1 x16 +#define pA2 x17 +#define alpha w18 +#define vec_len x19 +#define vec_lenx2 x20 + +#define alpha0 s10 +#define alphaZ z7.s + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA1 +// 17 pA1 +// 18 must save alpha +// 19 must save vec_len +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA10_0 +//v01 pA10_1 +//v02 pA20_0 +//v03 pA20_1 +//v04 +//v05 +//v06 +//v07 ALPHA0 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 +//v24 must save C8 +//v25 must save C9 +//v26 must save C10 +//v27 must save C11 +//v28 must save C12 +//v29 must save C13 +//v30 must save C14 +//v31 must save C15 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv2x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 + dup z24.s, #0 + dup z25.s, #0 + dup z26.s, #0 + dup z27.s, #0 + dup z28.s, #0 + dup z29.s, #0 + dup z30.s, #0 + dup z31.s, #0 +.endm + +.macro KERNELv2x8_I + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + ld1w z2.s, p0/z, [pA1, vec_len, lsl #2] + ld1w z3.s, p0/z, [pA2, vec_len, lsl #2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 4 *2 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 4 *2 + + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z20.s, p0/m, z0.s, z10.s + fmla z21.s, p0/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z24.s, p0/m, z0.s, z12.s + fmla z25.s, p0/m, z1.s, z12.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z26.s, p0/m, z0.s, z13.s + fmla z27.s, p0/m, z1.s, z13.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rw z13.s, p0/z, [pB, 20] + fmla z28.s, p0/m, z0.s, z14.s + fmla z29.s, 
p0/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z30.s, p0/m, z0.s, z15.s + fmla z31.s, p0/m, z1.s, z15.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + + add pB, pB, 32 +.endm + +.macro KERNELv2x8_M1 + ld1w z2.s, p0/z, [pA1] + ld1w z3.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z20.s, p0/m, z0.s, z10.s + fmla z21.s, p0/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z24.s, p0/m, z0.s, z12.s + fmla z25.s, p0/m, z1.s, z12.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z26.s, p0/m, z0.s, z13.s + fmla z27.s, p0/m, z1.s, z13.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rw z13.s, p0/z, [pB, 20] + fmla z28.s, p0/m, z0.s, z14.s + fmla z29.s, p0/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z30.s, p0/m, z0.s, z15.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + fmla z31.s, p0/m, z1.s, z15.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv2x8_M2 + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 2 * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 2 * 4 + + fmla z16.s, p0/m, z2.s, z8.s + fmla z17.s, p0/m, z3.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z18.s, p0/m, z2.s, z9.s + fmla z19.s, p0/m, z3.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z20.s, p0/m, z2.s, z10.s + fmla z21.s, p0/m, z3.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z22.s, p0/m, z2.s, z11.s + fmla z23.s, p0/m, z3.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z24.s, p0/m, z2.s, z12.s + fmla z25.s, p0/m, z3.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z26.s, p0/m, z2.s, z13.s + fmla z27.s, p0/m, z3.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z28.s, p0/m, z2.s, z14.s + fmla z29.s, p0/m, z3.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z30.s, p0/m, z2.s, z15.s + fmla z31.s, p0/m, z3.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv2x8_E + fmla z16.s, p0/m, z2.s, z8.s + fmla z17.s, p0/m, z3.s, z8.s + fmla z18.s, p0/m, z2.s, z9.s + fmla z19.s, p0/m, z3.s, z9.s + fmla z20.s, p0/m, z2.s, z10.s + fmla z21.s, p0/m, z3.s, z10.s + fmla z22.s, p0/m, z2.s, z11.s + fmla z23.s, p0/m, z3.s, z11.s + fmla z24.s, p0/m, z2.s, z12.s + fmla z25.s, p0/m, z3.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z26.s, p0/m, z2.s, z13.s + fmla z27.s, p0/m, z3.s, z13.s + fmla z28.s, p0/m, z2.s, z14.s + fmla z29.s, p0/m, z3.s, z14.s + fmla z30.s, p0/m, z2.s, z15.s + fmla z31.s, p0/m, z3.s, z15.s +.endm + +.macro KERNELv2x8_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + fmla z20.s, p0/m, z0.s, z10.s + prfm 
PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z21.s, p0/m, z1.s, z10.s + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s + fmla z24.s, p0/m, z0.s, z12.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z25.s, p0/m, z1.s, z12.s + fmla z26.s, p0/m, z0.s, z13.s + fmla z27.s, p0/m, z1.s, z13.s + fmla z28.s, p0/m, z0.s, z14.s + fmla z29.s, p0/m, z1.s, z14.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z30.s, p0/m, z0.s, z15.s + fmla z31.s, p0/m, z1.s, z15.s +.endm + +.macro SAVEv2x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z18.s, alphaZ + fmla z11.s, p0/m, z19.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z12.s, p0/z, [pCRow2] + ld1w z13.s, p0/z, [pCRow2, #1, mul vl] + fmla z12.s, p0/m, z20.s, alphaZ + fmla z13.s, p0/m, z21.s, alphaZ + st1w z12.s, p0, [pCRow2] + st1w z13.s, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z14.s, p0/z, [pCRow1] + ld1w z15.s, p0/z, [pCRow1, #1, mul vl] + fmla z14.s, p0/m, z22.s, alphaZ + fmla z15.s, p0/m, z23.s, alphaZ + st1w z14.s, p0, [pCRow1] + st1w z15.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z8.s, p0/z, [pCRow2] + ld1w z9.s, p0/z, [pCRow2, #1, mul vl] + fmla z8.s, p0/m, z24.s, alphaZ + fmla z9.s, p0/m, z25.s, alphaZ + st1w z8.s, p0, [pCRow2] + st1w z9.s, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z26.s, alphaZ + fmla z11.s, p0/m, z27.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z12.s, p0/z, [pCRow2] + ld1w z13.s, p0/z, [pCRow2, #1, mul vl] + fmla z12.s, p0/m, z28.s, alphaZ + fmla z13.s, p0/m, z29.s, alphaZ + st1w z12.s, p0, [pCRow2] + st1w z13.s, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z14.s, p0/z, [pCRow1] + ld1w z15.s, p0/z, [pCRow1, #1, mul vl] + fmla z14.s, p0/m, z30.s, alphaZ + fmla z15.s, p0/m, z31.s, alphaZ + st1w z14.s, p0, [pCRow1] + st1w z15.s, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 + +.endm + +.macro INITv2x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv2x4_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + fmla z18.s, p0/m, z0.s, z9.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.s, p0/m, z1.s, z9.s + fmla z20.s, p0/m, z0.s, z10.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z21.s, p0/m, z1.s, z10.s + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s +.endm + +.macro SAVEv2x4 + + prfm PLDL2KEEP, 
[pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z18.s, alphaZ + fmla z11.s, p0/m, z19.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z12.s, p0/z, [pCRow2] + ld1w z13.s, p0/z, [pCRow2, #1, mul vl] + fmla z12.s, p0/m, z20.s, alphaZ + fmla z13.s, p0/m, z21.s, alphaZ + st1w z12.s, p0, [pCRow2] + st1w z13.s, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z14.s, p0/z, [pCRow1] + ld1w z15.s, p0/z, [pCRow1, #1, mul vl] + fmla z14.s, p0/m, z22.s, alphaZ + fmla z15.s, p0/m, z23.s, alphaZ + st1w z14.s, p0, [pCRow1] + st1w z15.s, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 + +.endm + +.macro INITv2x2 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv2x2_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] +.endm + +.macro SAVEv2x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z18.s, alphaZ + fmla z11.s, p0/m, z19.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 +.endm + +.macro INITv2x1 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv2x1_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] +.endm + +.macro SAVEv2x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 + +.endm + +.macro INITv1x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x8_I + ld1w z0.s, p1/z, [pA1] + ld1w z1.s, p1/z, [pA1, lanes, lsl #2] // next one + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 
16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M1 + ld1w z1.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M2 + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + fmla z16.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_E + fmla z16.s, p1/m, z1.s, z8.s + fmla z17.s, p1/m, z1.s, z9.s + fmla z18.s, p1/m, z1.s, z10.s + fmla z19.s, p1/m, z1.s, z11.s + fmla z20.s, p1/m, z1.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + fmla z22.s, p1/m, z1.s, z14.s + fmla z23.s, p1/m, z1.s, z15.s +.endm + +.macro KERNELv1x8_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + fmla z18.s, p1/m, z0.s, z10.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.s, p1/m, z0.s, z11.s + fmla z20.s, p1/m, z0.s, z12.s + fmla z21.s, p1/m, z0.s, z13.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.s, p1/m, z0.s, z14.s + fmla z23.s, p1/m, z0.s, z15.s + + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, 
p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z28.s, p1/z, [pCRow2] + fmla z28.s, p1/m, z20.s, alphaZ + st1w z28.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z29.s, p1/z, [pCRow1] + fmla z29.s, p1/m, z21.s, alphaZ + st1w z29.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z30.s, p1/z, [pCRow2] + fmla z30.s, p1/m, z22.s, alphaZ + st1w z30.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z31.s, p1/z, [pCRow1] + fmla z31.s, p1/m, z23.s, alphaZ + st1w z31.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x4_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.s, p1/m, z0.s, z10.s + fmla z19.s, p1/m, z0.s, z11.s + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z17.s, p1/m, z0.s, z9.s + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.s, #0 +.endm + +.macro KERNELv1x1_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + + 
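+// The v1xN macros above mirror the v2xN ones but operate on a single A panel (pA1) under
+// the tail predicate p1; they are used once fewer than 2*SVE_LEN rows remain in M.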
+/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 + dup alphaZ, alpha + cntw vec_len + lsl vec_lenx2, vec_len, #1 + + lsl LDC, LDC, #2 // ldc = ldc * 8 + ptrue p0.s // create true predicate + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Lsgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Lsgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L8_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN + blt .Lsgemm_kernel_L8_Mv1_BEGIN + + mov counterI, origM + +/* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */ + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // pA1 = start of A array + prfm PLDL1KEEP, [pA2] + + .align 5 +.Lsgemm_kernel_L8_Mv2_20: + + mov pB, origPB + INITv2x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
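+// The main K loop is software-pipelined: KERNELv2x8_I preloads both A vector pairs and the
+// first eight B values, _M1/_M2 alternate FMAs on the z0/z1 and z2/z3 pairs while reloading
+// the other pair, and _E drains the last pair without issuing further loads.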
+ blt .Lsgemm_kernel_L8_Mv2_32 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lsgemm_kernel_L8_Mv2_22a + + .align 5 +.Lsgemm_kernel_L8_Mv2_22: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_Mv2_22 + + .align 5 +.Lsgemm_kernel_L8_Mv2_22a: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + b .Lsgemm_kernel_L8_Mv2_44 + + .align 5 +.Lsgemm_kernel_L8_Mv2_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_Mv2_40 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + + b .Lsgemm_kernel_L8_Mv2_44 + +.Lsgemm_kernel_L8_Mv2_40: + + INITv2x8 + +.Lsgemm_kernel_L8_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L8_Mv2_100 + + .align 5 +.Lsgemm_kernel_L8_Mv2_46: + + KERNELv2x8_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L8_Mv2_46 + +.Lsgemm_kernel_L8_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x8 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // + +.Lsgemm_kernel_L8_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L8_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L8_END + +////////////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x8 kernel. +.Lsgemm_kernel_L8_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lsgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
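+// Same pipelined structure as the v2x8 loop above, using KERNELv1x8_I/_M1/_M2/_E on the
+// single predicated panel.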
+ blt .Lsgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lsgemm_kernel_L8_Mv1_22a + + .align 5 +.Lsgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_Mv1_22 + + .align 5 +.Lsgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Lsgemm_kernel_L8_Mv1_44 + + .align 5 +.Lsgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Lsgemm_kernel_L8_Mv1_44 + +.Lsgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Lsgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L8_Mv1_100 + + .align 5 +.Lsgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L8_Mv1_46 + +.Lsgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Lsgemm_kernel_L8_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Lsgemm_kernel_L8_Mv1_20 + +.Lsgemm_kernel_L8_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 8 * 4 + + subs counterJ, counterJ , #1 // j-- + bgt .Lsgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Lsgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Lsgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L4_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Lsgemm_kernel_L4_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // pA1 = start of A array + + .align 5 +.Lsgemm_kernel_L4_Mv2_20: + + mov pB, origPB + INITv2x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
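+// N remainder of 4: no pipelining here, just eight KERNELv2x4_SUB per unrolled pass with
+// B prefetches interleaved; the K % 8 tail is handled at .Lsgemm_kernel_L4_Mv2_46.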
+ ble .Lsgemm_kernel_L4_Mv2_44 + + .align 5 +.Lsgemm_kernel_L4_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_Mv2_22 + +.Lsgemm_kernel_L4_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L4_Mv2_100 + + .align 5 +.Lsgemm_kernel_L4_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L4_Mv2_46 + +.Lsgemm_kernel_L4_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x4 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // + +.Lsgemm_kernel_L4_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L4_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L4_END + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x4 kernel. +.Lsgemm_kernel_L4_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lsgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Lsgemm_kernel_L4_Mv1_44 + + .align 5 +.Lsgemm_kernel_L4_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_Mv1_22 + +.Lsgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L4_Mv1_100 + + .align 5 +.Lsgemm_kernel_L4_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L4_Mv1_46 + +.Lsgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lsgemm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lsgemm_kernel_L4_Mv1_20 + + +.Lsgemm_kernel_L4_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 4 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Lsgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Lsgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L2_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Lsgemm_kernel_L2_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // pA1 = start of A array + + .align 5 +.Lsgemm_kernel_L2_Mv2_20: + + mov pB, origPB + INITv2x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Lsgemm_kernel_L2_Mv2_44 + + .align 5 +.Lsgemm_kernel_L2_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_Mv2_22 + +.Lsgemm_kernel_L2_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L2_Mv2_100 + + .align 5 +.Lsgemm_kernel_L2_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L2_Mv2_46 + +.Lsgemm_kernel_L2_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x2 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // + +.Lsgemm_kernel_L2_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L2_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L2_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x2 kernel. +.Lsgemm_kernel_L2_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lsgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Lsgemm_kernel_L2_Mv1_44 + + .align 5 +.Lsgemm_kernel_L2_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_Mv1_22 + +.Lsgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L2_Mv1_100 + + .align 5 +.Lsgemm_kernel_L2_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L2_Mv1_46 + +.Lsgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Lsgemm_kernel_L2_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lsgemm_kernel_L2_Mv1_20 + + +.Lsgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Lsgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lsgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L1_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Lsgemm_kernel_L1_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // pA1 = start of A array + + + .align 5 +.Lsgemm_kernel_L1_Mv2_20: + + mov pB, origPB + INITv2x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
+ ble .Lsgemm_kernel_L1_Mv2_44 + + .align 5 +.Lsgemm_kernel_L1_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv2_22 + +.Lsgemm_kernel_L1_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L1_Mv2_100 + + .align 5 +.Lsgemm_kernel_L1_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv2_46 + +.Lsgemm_kernel_L1_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x1 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // + +.Lsgemm_kernel_L1_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L1_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L1_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x1 kernel. +.Lsgemm_kernel_L1_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lsgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? + ble .Lsgemm_kernel_L1_Mv1_44 + + .align 5 +.Lsgemm_kernel_L1_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv1_22 + +.Lsgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L1_Mv1_100 + + .align 5 +.Lsgemm_kernel_L1_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv1_46 + +.Lsgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Lsgemm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lsgemm_kernel_L1_Mv1_20 + + +.Lsgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_ncopy_sve_v1.c b/kernel/arm64/sgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..1bc186335 --- /dev/null +++ b/kernel/arm64/sgemm_ncopy_sve_v1.c @@ -0,0 +1,78 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + svint32_t lda_vec = svindex_s32(0LL, lda); + uint32_t sve_size = svcntw(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b32(j, n); + uint32_t active = svcntp_b32(svptrue_b32(), pg); + do { + + aoffset1 = aoffset; + + uint32_t i_cnt = m; + while (i_cnt--) { + svfloat32_t a_vec = svld1_gather_index(pg, (float *) aoffset1, lda_vec); + svst1_f32(pg, (float *) boffset, a_vec); + aoffset1++; + boffset += active; + } + aoffset += sve_size * lda; + + j += svcntw(); + pg = svwhilelt_b32(j, n); + active = svcntp_b32(svptrue_b32(), pg); + + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S index 12b80bdca..431f1ae2a 100644 --- a/kernel/arm64/sgemm_tcopy_16.S +++ b/kernel/arm64/sgemm_tcopy_16.S @@ -30,7 +30,7 @@ All rights reserved. #define B00 x22 -#define I x18 +#define I x21 #define J x19 #define TEMP1 x20 @@ -270,11 +270,6 @@ All rights reserved. ldr s1, [A02] ldr s2, [A03] ldr s3, [A04] - - add A01, A01, #4 - add A02, A02, #4 - add A03, A03, #4 - add A04, A04, #4 stp s0, s1, [B04] add B04, B04, #8 @@ -285,11 +280,6 @@ All rights reserved. ldr s5, [A06] ldr s6, [A07] ldr s7, [A08] - - ldr d4, [A05], #8 - ldr d5, [A06], #8 - ldr d6, [A07], #8 - ldr d7, [A08], #8 stp s4, s5, [B04] add B04, B04, #8 diff --git a/kernel/arm64/sgemm_tcopy_sve_v1.c b/kernel/arm64/sgemm_tcopy_sve_v1.c new file mode 100644 index 000000000..9f8cf502a --- /dev/null +++ b/kernel/arm64/sgemm_tcopy_sve_v1.c @@ -0,0 +1,77 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + uint32_t sve_size = svcntw(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b32(j, n); + uint32_t active = svcntp_b32(svptrue_b32(), pg); + do { + + aoffset1 = aoffset; + + uint32_t i_cnt = m; + while (i_cnt--) { + svfloat32_t a_vec = svld1(pg, (float *) aoffset1); + svst1_f32(pg, (float *) boffset, a_vec); + aoffset1 += lda; + boffset += active; + } + aoffset += sve_size; + + j += svcntw(); + pg = svwhilelt_b32(j, n); + active = svcntp_b32(svptrue_b32(), pg); + + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S index 985a0a9a6..a44326aeb 100644 --- a/kernel/arm64/strmm_kernel_16x4.S +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alpha w17 -#define temp x18 +//#define temp x18 #define tempOffset x19 #define tempK x20 +#define temp x21 #define alpha0 s10 #define alphaV0 v10.s[0] diff --git a/kernel/arm64/strmm_kernel_sve_v1x8.S b/kernel/arm64/strmm_kernel_sve_v1x8.S new file mode 100644 index 000000000..3c45e3e29 --- /dev/null +++ b/kernel/arm64/strmm_kernel_sve_v1x8.S @@ -0,0 +1,1008 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA x16 +#define alpha w17 +//#define temp x18 +#define tempOffset x19 +#define tempK x20 +#define temp x21 + +#define alpha0 s10 +#define alphaZ z2.s + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x8_I + ld1w z0.s, p1/z, [pA] + ld1w z1.s, p1/z, [pA, lanes, lsl #2] // next one + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, 
[pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M1 + ld1w z1.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M2 + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + fmla z16.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_E + fmla z16.s, p1/m, z1.s, z8.s + fmla z17.s, p1/m, z1.s, z9.s + fmla z18.s, p1/m, z1.s, z10.s + fmla z19.s, p1/m, z1.s, z11.s + fmla z20.s, p1/m, z1.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + fmla z22.s, p1/m, z1.s, z14.s + fmla z23.s, p1/m, z1.s, z15.s +.endm + +.macro KERNELv1x8_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + fmla z18.s, p1/m, z0.s, z10.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z19.s, p1/m, z0.s, z11.s + fmla z20.s, p1/m, z0.s, z12.s + fmla z21.s, p1/m, z0.s, z13.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.s, p1/m, z0.s, z14.s + fmla z23.s, p1/m, z0.s, z15.s + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.s, p1/m, z16.s, alphaZ + st1w z16.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z17.s, p1/m, z17.s, alphaZ + st1w z17.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z18.s, p1/m, z18.s, alphaZ + st1w z18.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z19.s, p1/m, z19.s, alphaZ + st1w z19.s, p1, [pCRow1] + prfm PLDL2KEEP, 
[pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z20.s, p1/m, z20.s, alphaZ + st1w z20.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z21.s, p1/m, z21.s, alphaZ + st1w z21.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z22.s, p1/m, z22.s, alphaZ + st1w z22.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z23.s, p1/m, z23.s, alphaZ + st1w z23.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x4_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z18.s, p1/m, z0.s, z10.s + fmla z19.s, p1/m, z0.s, z11.s + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.s, p1/m, z16.s, alphaZ + st1w z16.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z17.s, p1/m, z17.s, alphaZ + st1w z17.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z18.s, p1/m, z18.s, alphaZ + st1w z18.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z19.s, p1/m, z19.s, alphaZ + st1w z19.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z17.s, p1/m, z0.s, z9.s + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.s, p1/m, z16.s, alphaZ + st1w z16.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z17.s, p1/m, z17.s, alphaZ + st1w z17.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.s, #0 +.endm + +.macro KERNELv1x1_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + fmul z16.s, p1/m, z16.s, alphaZ + st1w z16.s, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 
* 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 + dup alphaZ, alpha + + lsl LDC, LDC, #2 // ldc = ldc * 8 + ptrue p0.s // create true predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Lstrmm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Lstrmm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Lstrmm_kernel_L8_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lstrmm_kernel_L8_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #8 +#endif + + INITv1x8 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Lstrmm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lstrmm_kernel_L8_Mv1_22a + + .align 5 +.Lstrmm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_Mv1_22 + + .align 5 +.Lstrmm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Lstrmm_kernel_L8_Mv1_44 + + .align 5 +.Lstrmm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Lstrmm_kernel_L8_Mv1_44 + +.Lstrmm_kernel_L8_Mv1_40: + + INITv1x8 + +.Lstrmm_kernel_L8_Mv1_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L8_Mv1_100 + + .align 5 +.Lstrmm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Lstrmm_kernel_L8_Mv1_46 + +.Lstrmm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #8 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lstrmm_kernel_L8_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lstrmm_kernel_L8_Mv1_20 + +.Lstrmm_kernel_L8_END: + + lsl temp, 
origK, #5 + add origPB, origPB, temp // B = B + K * 8 * 4 + +#if !defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lstrmm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Lstrmm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Lstrmm_kernel_L2_BEGIN + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Lstrmm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lstrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Lstrmm_kernel_L4_Mv1_44 + + .align 5 +.Lstrmm_kernel_L4_Mv1_22: + + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_Mv1_22 + +.Lstrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L4_Mv1_100 + + .align 5 +.Lstrmm_kernel_L4_Mv1_46: + + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lstrmm_kernel_L4_Mv1_46 + +.Lstrmm_kernel_L4_Mv1_100: + + SAVEv1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lstrmm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lstrmm_kernel_L4_Mv1_20 + + +.Lstrmm_kernel_L4_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 4 * 4 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Lstrmm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Lstrmm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Lstrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lstrmm_kernel_L2_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, 
tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + INITv1x2 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Lstrmm_kernel_L2_Mv1_44 + + .align 5 +.Lstrmm_kernel_L2_Mv1_22: + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_Mv1_22 + +.Lstrmm_kernel_L2_Mv1_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L2_Mv1_100 + + .align 5 +.Lstrmm_kernel_L2_Mv1_46: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Lstrmm_kernel_L2_Mv1_46 + +.Lstrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + +.Lstrmm_kernel_L2_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lstrmm_kernel_L2_Mv1_20 + + +.Lstrmm_kernel_L2_END: + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Lstrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lstrmm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Lstrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lstrmm_kernel_L1_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempOffset, #2 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + INITv1x1 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
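+	// counterL now holds tempK/8; the branch below skips the eight-way unrolled
+	// loop (..._22) and goes straight to the tail at ..._44/_46, which issues one
+	// KERNELv1x1_SUB per remaining iteration (tempK & 7). The M loop itself is
+	// predicate-driven (whilelt/cntp/incw), so the last M % SVE_LEN iterations
+	// reuse the same path with a partial predicate. A rough intrinsics sketch of
+	// that pattern (single precision assumed, names illustrative only):
+	//
+	//   for (uint64_t i = 0; svptest_any(svptrue_b32(), svwhilelt_b32(i, (uint64_t)m));
+	//        i += svcntw()) {
+	//       svbool_t p1 = svwhilelt_b32(i, (uint64_t)m);        // active rows of this panel
+	//       uint32_t lanes = svcntp_b32(svptrue_b32(), p1);     // panel height
+	//       /* run the v1x8 / v1x4 / v1x2 / v1x1 micro-kernels under p1 */
+	//   }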
+ ble .Lstrmm_kernel_L1_Mv1_44 + + .align 5 +.Lstrmm_kernel_L1_Mv1_22: + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_Mv1_22 + +.Lstrmm_kernel_L1_Mv1_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L1_Mv1_100 + + .align 5 +.Lstrmm_kernel_L1_Mv1_46: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_Mv1_46 + +.Lstrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + + +.Lstrmm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lstrmm_kernel_L1_Mv1_20 + + +.Lstrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/symm_lcopy_sve.c b/kernel/arm64/symm_lcopy_sve.c new file mode 100644 index 000000000..6ba4afc8b --- /dev/null +++ b/kernel/arm64/symm_lcopy_sve.c @@ -0,0 +1,143 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, offset; + +#if defined(DOUBLE) + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); + svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec); + + svst1(pg, b, data_vec); + + b += active; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t N = n; + int32_t j = 0; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); + svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec); + + svst1(pg, b, data_vec); + + b += active; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/symm_ucopy_sve.c b/kernel/arm64/symm_ucopy_sve.c new file mode 100644 index 000000000..32da5bd16 --- /dev/null +++ b/kernel/arm64/symm_ucopy_sve.c @@ -0,0 +1,143 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, offset; + +#if defined(DOUBLE) + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); + svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda); + svint64_t gat_ind = svsel(cmp, temp2, temp1); + + i = m; + while (i>0) { + svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, one_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + + svst1(pg, b, data_vec); + + b += active; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t N = n; + int32_t j = 0; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t 
index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); + svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda); + svint32_t gat_ind = svsel(cmp, temp2, temp1); + + i = m; + while (i>0) { + svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, one_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + + svst1(pg, b, data_vec); + + b += active; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c new file mode 100644 index 000000000..918e945ac --- /dev/null +++ b/kernel/arm64/trmm_lncopy_sve_v1.c @@ -0,0 +1,136 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY + posX * lda; + } else { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif + svst1(pn, b, aj_vec); + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X < posY) { + ao += lda; + b += n_active; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+k*lda+j); + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+k*lda+j); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#endif + ao += n_active; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c new file mode 100644 index 000000000..b76cc56de --- /dev/null +++ b/kernel/arm64/trmm_ltcopy_sve_v1.c @@ -0,0 +1,136 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT *ao; + js = 0; +#ifdef DOUBLE + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY + posX * lda; + } else { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) { + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X < posY) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif + svst1(pn, b, aj_vec); + ao += lda; + b += n_active; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k); + } + } +#endif + ao += n_active * lda; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + + return 0; +} diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c new file mode 100644 index 000000000..75fa163ae --- /dev/null +++ b/kernel/arm64/trmm_uncopy_sve_v1.c @@ -0,0 +1,136 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX + posY * lda; + } else { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif + svst1(pn, b, aj_vec); + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X > posY) { + ao += lda; + b += n_active; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. 
*/ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j); + } + } +#endif + ao += n_active; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c new file mode 100644 index 000000000..36a03242a --- /dev/null +++ b/kernel/arm64/trmm_utcopy_sve_v1.c @@ -0,0 +1,134 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT *ao; + js = 0; +#ifdef DOUBLE + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX + posY * lda; + } else { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) { + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X > posY) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif + svst1(pn, b, aj_vec); + ao += lda; + b += n_active; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+j*lda+k); + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+j*lda+k); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#endif + ao += n_active * lda; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/trsm_kernel_LN_sve.c b/kernel/arm64/trsm_kernel_LN_sve.c new file mode 100644 index 000000000..fa1c6e984 --- /dev/null +++ b/kernel/arm64/trsm_kernel_LN_sve.c @@ -0,0 +1,320 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; +#ifdef DOUBLE + int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); 
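+  /*
+   * Column panels of width GEMM_UNROLL_N are processed left to right.  For the
+   * LN case the M dimension is swept from the bottom of the matrix upwards:
+   * the m % sve_size remainder rows are solved first, then full sve_size row
+   * blocks, with kk tracking the diagonal position.  Schematically:
+   *
+   *   for each GEMM_UNROLL_N-wide panel of B/C:
+   *       kk = m + offset
+   *       for each row block, bottom to top:
+   *           GEMM_KERNEL(...)   // subtract updates from rows already solved below
+   *           solve(...)         // triangular solve of the diagonal block
+   *           kk -= block height
+   */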
+ + while (j > 0) { + + kk = m + offset; + + i = m % sve_size; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + int mod = i; + i = sve_size; + if (i <= m) { + aa = a + (m - mod - sve_size) * k * COMPSIZE; + cc = c + (m - mod - sve_size) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + (kk - sve_size) * sve_size * COMPSIZE, + b + (kk - sve_size) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa -= sve_size * k * COMPSIZE; + cc -= sve_size * COMPSIZE; + kk -= sve_size; + + i += sve_size; + } while (i <= m); + } + + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + i = m % sve_size; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + int mod = i; + i = sve_size; + if (i <= m) { + aa = a + (m - mod - sve_size) * k * COMPSIZE; + cc = c + (m - mod - sve_size) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, j, + aa + (kk - sve_size) * sve_size * COMPSIZE, + b + (kk - sve_size) * j * COMPSIZE, + cc, ldc); + + aa -= sve_size * k * COMPSIZE; + cc -= sve_size * COMPSIZE; + kk -= sve_size; + + i += sve_size; + } while (i <= m); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/arm64/trsm_kernel_LT_sve.c b/kernel/arm64/trsm_kernel_LT_sve.c new file mode 100644 index 000000000..2cbb2aafb --- /dev/null +++ b/kernel/arm64/trsm_kernel_LT_sve.c @@ -0,0 +1,295 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; +#ifdef DOUBLE + int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + + if (kk > 
0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + kk * sve_size * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + kk += sve_size; + i += sve_size; + } + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += sve_size; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(sve_size, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(sve_size, j, + aa + kk * sve_size * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + kk += sve_size; + i += sve_size; + } + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/arm64/trsm_kernel_RN_sve.c b/kernel/arm64/trsm_kernel_RN_sve.c new file mode 100644 index 000000000..5e4e8d9b1 --- /dev/null +++ b/kernel/arm64/trsm_kernel_RN_sve.c @@ -0,0 +1,293 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; +#ifdef DOUBLE + int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = sve_size; + + if (i <= m) { + do { + if (kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + kk * sve_size * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * 
k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += sve_size; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(sve_size, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(sve_size, j, + aa + kk * sve_size * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/arm64/trsm_kernel_RT_sve.c b/kernel/arm64/trsm_kernel_RT_sve.c new file mode 100644 index 000000000..c376c0e33 --- /dev/null +++ b/kernel/arm64/trsm_kernel_RT_sve.c @@ -0,0 +1,317 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; +#ifdef DOUBLE + int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = sve_size; + if (i <= m) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, j, + aa + (kk - j) * sve_size * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + i = m % sve_size; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * 
COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = sve_size; + if (i <= m) { + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * sve_size * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + i = m % sve_size; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c new file mode 100644 index 000000000..5a9d4194a --- /dev/null +++ b/kernel/arm64/trsm_lncopy_sve.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + j * n_active + k) = *(ao + k * lda + j); + } + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); + } + ao += n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif + svst1(pn, b, aj_vec); + } + ao++; + b += n_active; + i++; + ii++; + } + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c new file mode 100644 index 000000000..ac4019e26 --- /dev/null +++ b/kernel/arm64/trsm_ltcopy_sve.c @@ -0,0 +1,117 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + j * n_active + k) = *(ao + j * lda + k); + } + } + b += n_active * n_active; + ao += lda * n_active; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif + svst1(pn, b, aj_vec); + } + ao += lda; + b += n_active; + i ++; + ii ++; + } + } while (i < m); + + + a += n_active; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c new file mode 100644 index 000000000..8fdcd0f4b --- /dev/null +++ b/kernel/arm64/trsm_uncopy_sve.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + j * n_active + k) = *(ao + k * lda + j); + } + } + ao += n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif + svst1(pn, b, aj_vec); + } + ao++; + b += n_active; + i++; + ii++; + } + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c new file mode 100644 index 000000000..0f5f0dccd --- /dev/null +++ b/kernel/arm64/trsm_utcopy_sve.c @@ -0,0 +1,117 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + j * n_active + k) = *(ao + j * lda + k); + } + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); + } + ao += lda * n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif + svst1(pn, b, aj_vec); + } + ao += lda; + b += n_active; + i ++; + ii ++; + } + } while (i < m); + + + a += n_active; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S index f8e877f3c..a65c4f581 100644 --- a/kernel/arm64/zgemm_kernel_4x4.S +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow2 x14 #define pCRow3 x15 #define pA x16 -#define alphaR x17 -#define alphaI x18 +#define alphaR x19 +#define alphaI x20 #define alpha0_R d10 #define alphaV0_R v10.d[0] diff --git a/kernel/arm64/zgemm_kernel_4x4_cortexa53.c b/kernel/arm64/zgemm_kernel_4x4_cortexa53.c new file mode 100644 index 000000000..aa0f7d72d --- /dev/null +++ b/kernel/arm64/zgemm_kernel_4x4_cortexa53.c @@ -0,0 +1,736 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <arm_neon.h>
+
+/*******************************************************************************
+   The complex GEMM kernels in OpenBLAS use static configuration of conjugation
+modes via specific macros:
+
+   MACRO_NAME  | conjugation on matrix A | conjugation on matrix B |
+   ----------- | ----------------------- | ----------------------- |
+   NN/NT/TN/TT |           No            |           No            |
+   NR/NC/TR/TC |           No            |           Yes           |
+   RN/RT/CN/CT |           Yes           |           No            |
+   RR/RC/CR/CC |           Yes           |           Yes           |
+
+   "Conjugation on matrix A" means the complex conjugates of the elements of
+matrix A are used in the matrix multiplication rather than the original
+elements; "conjugation on matrix B" means the complex conjugate of each
+element of matrix B is used, likewise.
+
+   Complex numbers in arrays or matrices are usually packed together as an
+array of structs (without padding):
+     struct complex_number {
+       FLOAT real_part;
+       FLOAT imag_part;
+     };
+
+   For a double complex array ARR[], which is usually DEFINED AS AN ARRAY OF
+DOUBLE, the real part of its Kth complex number can be accessed as
+ARR[K * 2] and the imaginary part as ARR[2 * K + 1].
+
+   This file uses two ways to vectorize the matrix multiplication of complex
+numbers:
+
+(1) Expanded-form
+
+    During accumulation along direction K:
+
+                                        Σk(a[0][k].real   b[k][n].real)
+              accumulate                Σk(a[0][k].imag   b[k][n].real)
+          ------------------->                        .
+          |   * b[k][n].real                          .
+          |   (broadcasted)                           .
+     a[0][k].real                       Σk(a[v-1][k].real b[k][n].real)
+     a[0][k].imag                       Σk(a[v-1][k].imag b[k][n].real)
+          .                                       VECTOR I
+ (vec_a)  .
+          .
+     a[v-1][k].real                     Σk(a[0][k].real   b[k][n].imag)
+     a[v-1][k].imag                     Σk(a[0][k].imag   b[k][n].imag)
+          |                                           .
+          |   accumulate                              .
+          ------------------->                        .
+              * b[k][n].imag            Σk(a[v-1][k].real b[k][n].imag)
+              (broadcasted)             Σk(a[v-1][k].imag b[k][n].imag)
+                                                  VECTOR II
+
+    After accumulation, prior to storage:
+
+        -1                   -Σk(a[0][k].imag   b[k][n].imag)
+         1                    Σk(a[0][k].real   b[k][n].imag)
+         .                                 .
+  VECTOR II   permute and multiply         .        to get        .
+         .                                 .
+ -1 -Σk(a[v-1][k].imag b[k][n].imag) + 1 Σk(a[v-1][k].real b[k][n].imag) + + then add with VECTOR I to get the result vector of elements of C. + + 2 vector registers are needed for every v elements of C, with +v == sizeof(vector) / sizeof(complex) + +(2) Contracted-form + + During accumulation along direction K: + + (the K coordinate is not shown, since the operation is identical for each k) + + (load vector in mem) (load vector in mem) + a[0].r a[0].i ... a[v-1].r a[v-1].i a[v].r a[v].i ... a[2v-1].r a[2v-1]i + | | + | unzip operation (or VLD2 in arm neon) | + ----------------------------------------------------- + | + | + -------------------------------------------------- + | | + | | + v v + a[0].real ... a[2v-1].real a[0].imag ... a[2v-1].imag + | | | | + | | * b[i].imag(broadcast) | | + * b[i].real | -----------------------------|---- | * b[i].real + (broadcast) | | | | (broadcast) + | ------------------------------ | | + + | - | * b[i].imag(broadcast) + | + | + v v v v + (accumulate) (accumulate) + c[0].real ... c[2v-1].real c[0].imag ... c[2v-1].imag + VECTOR_REAL VECTOR_IMAG + + After accumulation, VECTOR_REAL and VECTOR_IMAG are zipped (interleaved) +then stored to matrix C directly. + + For 2v elements of C, only 2 vector registers are needed, while +4 registers are required for expanded-form. +(v == sizeof(vector) / sizeof(complex)) + + For AArch64 zgemm, 4x4 kernel needs 32 128-bit NEON registers +to store elements of C when using expanded-form calculation, where +the register spilling will occur. So contracted-form operation is +selected for 4x4 kernel. As for all other combinations of unroll parameters +(2x4, 4x2, 2x2, and so on), expanded-form mode is used to bring more +NEON registers into usage to hide latency of multiply-add instructions. 
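+
+   As a concrete illustration (an editor's sketch, not part of this kernel),
+one contracted-form K-step for the non-conjugated (NN/NT/TN/TT) case with
+v == 2 double-complex elements could be written with NEON intrinsics roughly
+as follows; the pointer pa, the accumulator acc (float64x2x2_t) and the
+broadcast scalars b_r / b_i are hypothetical names, and <arm_neon.h> is
+assumed:
+
+     // de-interleave two complex elements of A:
+     // a.val[0] holds the real parts, a.val[1] the imaginary parts
+     float64x2x2_t a = vld2q_f64(pa);
+     // real accumulator:      c.r += a.r * b.r - a.i * b.i
+     acc.val[0] = vfmaq_n_f64(acc.val[0], a.val[0], b_r);
+     acc.val[0] = vfmsq_n_f64(acc.val[0], a.val[1], b_i);
+     // imaginary accumulator: c.i += a.r * b.i + a.i * b.r
+     acc.val[1] = vfmaq_n_f64(acc.val[1], a.val[0], b_i);
+     acc.val[1] = vfmaq_n_f64(acc.val[1], a.val[1], b_r);
+     // after the K loop, vst2q_f64(c, acc) zips the real/imag lanes back
+     // into the interleaved complex layout and stores two elements of C
+
+   The 4x4 kernel below follows this pattern in inline assembly through the
+FMLA_RR / FMLA_RI / FMLA_IR / FMLA_II macros, and store_4c() performs the
+final zip and alpha scaling with vld2q_f64()/vst2q_f64().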
+******************************************************************************/ + +static inline float64x2_t set_f64x2(double lo, double hi) { + float64x2_t ret = vdupq_n_f64(0); + ret = vsetq_lane_f64(lo, ret, 0); + ret = vsetq_lane_f64(hi, ret, 1); + return ret; +} + +static inline float64x2x2_t expand_alpha(double alpha_r, double alpha_i) { + float64x2x2_t ret = {{ set_f64x2(alpha_r, alpha_i), set_f64x2(-alpha_i, alpha_r) }}; + return ret; +} + +/***************************************************************** + * operation: *c += alpha * c_value //complex multiplication + * expanded_alpha: { { alpha_r, alpha_i }, { -alpha_i, alpha_r } + * expanded_c: {{ arbr, aibr }, { arbi, aibi }} + ****************************************************************/ +static inline void store_1c(double *c, float64x2x2_t expanded_c, + float64x2x2_t expanded_alpha) { + float64x2_t ld = vld1q_f64(c); +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1); + double imag = vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0); +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1); + double imag = vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1); + double imag = -vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0); +#else + double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1); + double imag = -vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0); +#endif + ld = vfmaq_n_f64(ld, expanded_alpha.val[0], real); + vst1q_f64(c, vfmaq_n_f64(ld, expanded_alpha.val[1], imag)); +} + +static inline void pref_c_4(const double *c) { + __asm__ __volatile__("prfm pstl1keep,[%0]; prfm pstl1keep,[%0,#56]\n\t"::"r"(c):); +} + +static inline float64x2x2_t add_ec(float64x2x2_t ec1, float64x2x2_t ec2) { + float64x2x2_t ret = {{ vaddq_f64(ec1.val[0], ec2.val[0]), + vaddq_f64(ec1.val[1], ec2.val[1]) }}; + return ret; +} + +static inline float64x2x2_t update_ec(float64x2x2_t ec, float64x2_t a, float64x2_t b) { + float64x2x2_t ret = {{ vfmaq_laneq_f64(ec.val[0], a, b, 0), vfmaq_laneq_f64(ec.val[1], a, b, 1) }}; + return ret; +} + +static inline float64x2x2_t init() { + float64x2x2_t ret = {{ vdupq_n_f64(0), vdupq_n_f64(0) }}; + return ret; +} + +static inline void kernel_1x1(const double *sa, const double *sb, double *C, + BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K > 3; K -= 4) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2), + a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), + b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b2); + c3 = update_ec(c3, a3, b3); + c4 = update_ec(c4, a4, b4); + } + c1 = add_ec(c1, c2); + c3 = add_ec(c3, c4); + c1 = add_ec(c1, c3); + for (; K; K--) { + c1 = update_ec(c1, vld1q_f64(sa), vld1q_f64(sb)); sa += 2; sb += 2; + } + store_1c(C, c1, expanded_alpha); +} + +static inline void kernel_2x1(const double *sa, const double *sb, double *C, + 
BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2), + a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b1); + c3 = update_ec(c3, a3, b2); + c4 = update_ec(c4, a4, b2); + } + c1 = add_ec(c1, c3); + c2 = add_ec(c2, c4); + if (K) { + float64x2_t b1 = vld1q_f64(sb); + c1 = update_ec(c1, vld1q_f64(sa), b1); + c2 = update_ec(c2, vld1q_f64(sa + 2), b1); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); +} + +static inline void kernel_1x2(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), + b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a1, b2); + c3 = update_ec(c3, a2, b3); + c4 = update_ec(c4, a2, b4); + } + c1 = add_ec(c1, c3); + c2 = add_ec(c2, c4); + if (K) { + float64x2_t a1 = vld1q_f64(sa); + c1 = update_ec(c1, a1, vld1q_f64(sb)); + c2 = update_ec(c2, a1, vld1q_f64(sb + 2)); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + LDC * 2, c2, expanded_alpha); +} + +static inline void kernel_2x2(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b1); + c3 = update_ec(c3, a1, b2); + c4 = update_ec(c4, a2, b2); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); C += LDC * 2; + store_1c(C, c3, expanded_alpha); + store_1c(C + 2, c4, expanded_alpha); +} + +static inline void kernel_4x1(const double *sa, const double *sb, double *C, + BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + pref_c_4(C); + + for (; K; K--) { + float64x2_t b1 = vld1q_f64(sb); sb += 2; + c1 = update_ec(c1, vld1q_f64(sa), b1); + c2 = update_ec(c2, vld1q_f64(sa + 2), b1); + c3 = update_ec(c3, vld1q_f64(sa + 4), b1); + c4 = update_ec(c4, vld1q_f64(sa + 6), b1); + sa += 8; + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); + store_1c(C + 4, c3, expanded_alpha); + store_1c(C + 6, c4, expanded_alpha); +} + +static inline void kernel_4x2(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8; + c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init(); + pref_c_4(C); + pref_c_4(C + LDC * 2); + + for (; K; K--) { + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2), + a3 = 
vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b1); + c3 = update_ec(c3, a3, b1); + c4 = update_ec(c4, a4, b1); + c5 = update_ec(c5, a1, b2); + c6 = update_ec(c6, a2, b2); + c7 = update_ec(c7, a3, b2); + c8 = update_ec(c8, a4, b2); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); + store_1c(C + 4, c3, expanded_alpha); + store_1c(C + 6, c4, expanded_alpha); C += LDC * 2; + store_1c(C, c5, expanded_alpha); + store_1c(C + 2, c6, expanded_alpha); + store_1c(C + 4, c7, expanded_alpha); + store_1c(C + 6, c8, expanded_alpha); +} + +static inline void kernel_1x4(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa); sa += 2; + c1 = update_ec(c1, a1, vld1q_f64(sb)); + c2 = update_ec(c2, a1, vld1q_f64(sb + 2)); + c3 = update_ec(c3, a1, vld1q_f64(sb + 4)); + c4 = update_ec(c4, a1, vld1q_f64(sb + 6)); + sb += 8; + } + store_1c(C, c1, expanded_alpha); C += LDC * 2; + store_1c(C, c2, expanded_alpha); C += LDC * 2; + store_1c(C, c3, expanded_alpha); C += LDC * 2; + store_1c(C, c4, expanded_alpha); +} + +static inline void kernel_2x4(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8; + c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init(); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), + b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b1); + c3 = update_ec(c3, a1, b2); + c4 = update_ec(c4, a2, b2); + c5 = update_ec(c5, a1, b3); + c6 = update_ec(c6, a2, b3); + c7 = update_ec(c7, a1, b4); + c8 = update_ec(c8, a2, b4); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); C += LDC * 2; + store_1c(C, c3, expanded_alpha); + store_1c(C + 2, c4, expanded_alpha); C += LDC * 2; + store_1c(C, c5, expanded_alpha); + store_1c(C + 2, c6, expanded_alpha); C += LDC * 2; + store_1c(C, c7, expanded_alpha); + store_1c(C + 2, c8, expanded_alpha); +} + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define FMLA_RI "fmla " +#define FMLA_IR "fmla " +#define FMLA_II "fmls " +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define FMLA_RI "fmls " +#define FMLA_IR "fmla " +#define FMLA_II "fmla " +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define FMLA_RI "fmla " +#define FMLA_IR "fmls " +#define FMLA_II "fmla " +#else +#define FMLA_RI "fmls " +#define FMLA_IR "fmls " +#define FMLA_II "fmls " +#endif +#define FMLA_RR "fmla " + +static inline void store_4c(double *C, float64x2_t up_r, float64x2_t up_i, + float64x2_t lo_r, float64x2_t lo_i, double alphar, double alphai) { + float64x2x2_t up = vld2q_f64(C), lo = vld2q_f64(C + 4); + up.val[0] = vfmaq_n_f64(up.val[0], up_r, alphar); + up.val[1] = vfmaq_n_f64(up.val[1], up_r, alphai); + lo.val[0] = vfmaq_n_f64(lo.val[0], lo_r, alphar); + lo.val[1] = vfmaq_n_f64(lo.val[1], lo_r, alphai); + up.val[0] = vfmsq_n_f64(up.val[0], up_i, alphai); + up.val[1] = vfmaq_n_f64(up.val[1], up_i, alphar); + lo.val[0] = vfmsq_n_f64(lo.val[0], lo_i, alphai); + lo.val[1] = 
vfmaq_n_f64(lo.val[1], lo_i, alphar); + vst2q_f64(C, up); + vst2q_f64(C + 4, lo); +} + +static inline void kernel_4x4(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + float64x2_t c1r, c1i, c2r, c2i; + float64x2_t c3r, c3i, c4r, c4i; + float64x2_t c5r, c5i, c6r, c6i; + float64x2_t c7r, c7i, c8r, c8i; + + const double *pref_ = C; + pref_c_4(pref_); pref_ += LDC * 2; + pref_c_4(pref_); pref_ += LDC * 2; + pref_c_4(pref_); pref_ += LDC * 2; + pref_c_4(pref_); + + __asm__ __volatile__( + "cmp %[K],#0\n\t" + "movi %[c1r].16b,#0; movi %[c1i].16b,#0; movi %[c2r].16b,#0; movi %[c2i].16b,#0\n\t" + "movi %[c3r].16b,#0; movi %[c3i].16b,#0; movi %[c4r].16b,#0; movi %[c4i].16b,#0\n\t" + "movi %[c5r].16b,#0; movi %[c5i].16b,#0; movi %[c6r].16b,#0; movi %[c6i].16b,#0\n\t" + "movi %[c7r].16b,#0; movi %[c7i].16b,#0; movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t" + "beq 4f; cmp %[K],#2\n\t" + "ld2 {v0.2d,v1.2d},[%[sa]],#32; ldp q4,q5,[%[sb]],#32\n\t" + "ld2 {v2.2d,v3.2d},[%[sa]],#32; ldr q6,[%[sb]]; ldr d7,[%[sb],#16]\n\t" + "ldr x0,[%[sb],#24]; add %[sb],%[sb],#32\n\t" + "beq 2f; blt 3f\n\t" + "1:\n\t" + "fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t" + FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t" + FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" + FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" + "fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t" + FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t" + FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" + FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" + "fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t" + FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t" + FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" + FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" + "fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t" + FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t" + FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" + FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" + "fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t" + FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t" + FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" + FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" + "fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t" + FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t" + FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t" + FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" + "fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t" + FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t" + FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" + FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" + "fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t" + FMLA_II "%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t" + FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" + FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" + "fmov v15.d[1],x0; ldr d4,[%[sb],#64]\n\t" + FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]; ldr x0,[%[sb],#72]\n\t" + FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" + FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" + "fmov v4.d[1],x0; ldr d5,[%[sb],#80]\n\t" + FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]; ldr x0,[%[sb],#88]\n\t" + FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" + FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" + "fmov v5.d[1],x0; ldr d0,[%[sa],#64]\n\t" + FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]; ldr x0,[%[sa],#80]\n\t" + FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t" + FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t" + "fmov v0.d[1],x0; ldr d1,[%[sa],#72]\n\t" + FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]; ldr x0,[%[sa],#88]\n\t" + FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t" + FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t" + "fmov v1.d[1],x0; ldr d2,[%[sa],#96]\n\t" + FMLA_II "%[c1r].2d,v9.2d,v12.d[1]; ldr x0,[%[sa],#112]\n\t" + FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t" + FMLA_IR 
"%[c1i].2d,v9.2d,v12.d[0]\n\t" + "fmov v2.d[1],x0; ldr d3,[%[sa],#104]\n\t" + FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]; ldr x0,[%[sa],#120]\n\t" + FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t" + FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t" + "fmov v3.d[1],x0; ldr d6,[%[sb],#96]\n\t" + FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]; ldr x0,[%[sb],#104]\n\t" + FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t" + FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t" + "fmov v6.d[1],x0; ldr d7,[%[sb],#112]\n\t" + FMLA_II "%[c4r].2d,v11.2d,v13.d[1]; ldr x0,[%[sb],#120]\n\t" + FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t" + FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]; prfm pldl1keep,[%[sa],#256]\n\t" + FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t" + FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]; prfm pldl1keep,[%[sa],#320]\n\t" + FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t" + FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]; prfm pldl1keep,[%[sb],#256]\n\t" + FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t" + FMLA_II "%[c6r].2d,v11.2d,v14.d[1]; prfm pldl1keep,[%[sb],#320]\n\t" + FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t" + FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#128\n\t" + FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t" + FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#128\n\t" + FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t" + FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t" + FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t" + FMLA_II "%[c8r].2d,v11.2d,v15.d[1]; cmp %[K],#2\n\t" + FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t" + FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; bgt 1b; blt 3f\n\t" + "2:\n\t" + "fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t" + FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t" + FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" + FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" + "fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t" + FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t" + FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" + FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" + "fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t" + FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t" + FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" + FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" + "fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t" + FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t" + FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" + FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" + "fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t" + FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t" + FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" + FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" + "fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t" + FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t" + FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t" + FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" + "fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t" + FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t" + FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" + FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" + "fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t" + FMLA_II "%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t" + FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" + FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" + "fmov v15.d[1],x0\n\t" + FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t" + FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" + FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" + FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t" + FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" + FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" + FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t" + FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t" + FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t" + FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]\n\t" + FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t" + FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t" + FMLA_II "%[c1r].2d,v9.2d,v12.d[1]\n\t" + FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t" + 
FMLA_IR "%[c1i].2d,v9.2d,v12.d[0]\n\t" + FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]\n\t" + FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t" + FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t" + FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]\n\t" + FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t" + FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t" + FMLA_II "%[c4r].2d,v11.2d,v13.d[1]\n\t" + FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t" + FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]\n\t" + FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t" + FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]\n\t" + FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t" + FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]\n\t" + FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t" + FMLA_II "%[c6r].2d,v11.2d,v14.d[1]\n\t" + FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t" + FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#64\n\t" + FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t" + FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#64\n\t" + FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t" + FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t" + FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t" + FMLA_II "%[c8r].2d,v11.2d,v15.d[1]\n\t" + FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t" + FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; b 4f\n\t" + "3:\n\t" + "fmov v7.d[1],x0\n\t" + FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]\n\t" + FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" + FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" + FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]\n\t" + FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" + FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" + FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]\n\t" + FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" + FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" + FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]\n\t" + FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" + FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" + FMLA_II "%[c3r].2d,v1.2d,v5.d[1]\n\t" + FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" + FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" + FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]\n\t" + FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t" + FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" + FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]\n\t" + FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" + FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" + FMLA_II "%[c6r].2d,v3.2d,v6.d[1]\n\t" + FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" + FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" + FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t" + FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" + FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" + FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t" + FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" + FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" + FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t" + FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]; sub %[K],%[K],#1\n\t" + "4:\n\t" + :[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i), + [c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i), + [c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i), + [c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i), + [K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb) + ::"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"); + + store_4c(C, c1r, c1i, c2r, c2i, alphar, alphai); C += LDC * 2; + store_4c(C, c3r, c3i, c4r, c4i, alphar, alphai); C += LDC * 2; + store_4c(C, c5r, c5i, c6r, c6i, alphar, alphai); C += LDC * 2; + store_4c(C, c7r, c7i, c8r, c8i, alphar, alphai); +} + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, + FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { + + BLASLONG n_left = N; + for (; n_left >= 4; n_left -= 4) { + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 4; m_left -= 4) { + kernel_4x4(a_, sb, c_, LDC, K, alphar, alphai); + a_ += 8 * K; + c_ += 8; + } + if 
(m_left >= 2) { + m_left -= 2; + kernel_2x4(a_, sb, c_, LDC, K, alphar, alphai); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x4(a_, sb, c_, LDC, K, alphar, alphai); + } + sb += 8 * K; + C += 8 * LDC; + } + if (n_left >= 2) { + n_left -= 2; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 4; m_left -= 4) { + kernel_4x2(a_, sb, c_, LDC, K, alphar, alphai); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x2(a_, sb, c_, LDC, K, alphar, alphai); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x2(a_, sb, c_, LDC, K, alphar, alphai); + } + sb += 4 * K; + C += 4 * LDC; + } + if (n_left) { + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 4; m_left -= 4) { + kernel_4x1(a_, sb, c_, K, alphar, alphai); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x1(a_, sb, c_, K, alphar, alphai); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x1(a_, sb, c_, K, alphar, alphai); + } + } + return 0; +} + diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S new file mode 100644 index 000000000..d5b35775c --- /dev/null +++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S @@ -0,0 +1,874 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR x19 +#define alphaI x20 + +#define alphaz_R z6.d +#define alphaz_I z7.d +#define alpha0_R d6 +#define alpha0_I d7 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x4_I + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + ld2d {z2.d, z3.d}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] +#if defined(NR) || 
defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.d, p1/m, z0.d, z9.d +#else + fmla z17.d, p1/m, z0.d, z9.d +#endif + OP_ii z16.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + + fmla z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.d, p1/m, z0.d, z11.d +#else + fmla z19.d, p1/m, z0.d, z11.d +#endif + ld1rd z11.d, p0/z, [pB, 24] + + + fmla z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.d, p1/m, z0.d, z13.d +#else + fmla z21.d, p1/m, z0.d, z13.d +#endif + OP_ii z20.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + + fmla z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.d, p1/m, z0.d, z15.d +#else + fmla z23.d, p1/m, z0.d, z15.d +#endif + OP_ii z22.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2d {z2.d, z3.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8 + + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.d, p1/m, z2.d, 
z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 64 + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + ld2d {z26.d, z27.d}, p1/z, [pCRow1] + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld2d {z28.d, z29.d}, p1/z, [pCRow2] + fmla z28.d, p1/m, z20.d, alphaz_R + fmls z28.d, p1/m, z21.d, alphaz_I + fmla z29.d, p1/m, z20.d, alphaz_I + fmla z29.d, p1/m, z21.d, alphaz_R + st2d {z28.d, z29.d}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #4 + + ld2d {z30.d, z31.d}, p1/z, [pCRow3] + fmla z30.d, p1/m, z22.d, alphaz_R + fmls z30.d, p1/m, z23.d, alphaz_I + fmla z31.d, p1/m, z22.d, alphaz_I + fmla z31.d, p1/m, z23.d, alphaz_R + st2d {z30.d, z31.d}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 32 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, 
[pCRow0, #C_PRE_SIZE] + + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + ld2d {z26.d, z27.d}, p1/z, [pCRow1] + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + dup alphaz_R, alphaR + fmov alphaI, d1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + ptrue p0.d // create true predicate + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lzgemm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lzgemm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + + mov pA, origPA // pA = start of A array + +.Lzgemm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lzgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 + cmp counterL , #2 + blt .Lzgemm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lzgemm_kernel_L4_Mv1_22a + + .align 5 +.Lzgemm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_Mv1_22 + + .align 5 +.Lzgemm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lzgemm_kernel_L4_Mv1_44 + + .align 5 +.Lzgemm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lzgemm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lzgemm_kernel_L4_Mv1_44 + + +.Lzgemm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lzgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Lzgemm_kernel_L4_Mv1_100 + + .align 5 +.Lzgemm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lzgemm_kernel_L4_Mv1_46 + +.Lzgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lzgemm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Lzgemm_kernel_L4_Mv1_20 + + + +.Lzgemm_kernel_L4_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + + subs counterJ, counterJ , #1 // j-- + bgt .Lzgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lzgemm_kernel_L999 + + tst counterJ , #2 + ble .Lzgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +.Lzgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lzgemm_kernel_L2_Mv1_20: + + INITv1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lzgemm_kernel_L2_Mv1_40 + .align 5 + +.Lzgemm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_Mv1_22 + + +.Lzgemm_kernel_L2_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L2_Mv1_100 + +.Lzgemm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_Mv1_42 + +.Lzgemm_kernel_L2_Mv1_100: + + SAVEv1x2 + +.Lzgemm_kernel_L2_Mv1_END: + + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lzgemm_kernel_L2_Mv1_20 + + +.Lzgemm_kernel_L2_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + 
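The "Loop over M is done in an SVE fashion" comment above summarizes the traversal scheme used by this kernel: whilelt builds a predicate for the rows that remain, cntp counts the active lanes, and incd/b.any advance and re-test, so the final M % SVE_LEN rows are handled by the same code path with a partially true predicate instead of a scalar tail loop. A rough C-level sketch of that control flow is given below, using the same ACLE intrinsics as the copy kernels later in this patch; process_block() is a hypothetical placeholder for the INITv1x4 / KERNELv1x4 / SAVEv1x4 sequence and is not part of the patch.

    #include <stdint.h>
    #include <arm_sve.h>

    /* Sketch only: mirrors the whilelt/cntp/incd/b.any loop of the kernel above. */
    static void m_loop_sketch(int64_t M)
    {
        int64_t i = 0;
        svbool_t p1 = svwhilelt_b64(i, M);                   /* rows active in this sweep    */
        while (svptest_any(svptrue_b64(), p1)) {
            uint64_t lanes = svcntp_b64(svptrue_b64(), p1);  /* number of rows handled here  */
            /* process_block(p1, lanes, ...);  hypothetical: init + kernel + save            */
            (void)lanes;
            i += svcntd();                                   /* advance by the vector length */
            p1 = svwhilelt_b64(i, M);                        /* partial on the last sweep    */
        }
    }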
+/******************************************************************************/ + +.Lzgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lzgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +.Lzgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lzgemm_kernel_L1_Mv1_20: + + INITv1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lzgemm_kernel_L1_Mv1_40 + .align 5 + +.Lzgemm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_Mv1_22 + + +.Lzgemm_kernel_L1_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L1_Mv1_100 + +.Lzgemm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_Mv1_42 + +.Lzgemm_kernel_L1_Mv1_100: + + SAVEv1x1 + +.Lzgemm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lzgemm_kernel_L1_Mv1_20 + +.Lzgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lzgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..8f9b4268a --- /dev/null +++ b/kernel/arm64/zgemm_ncopy_sve_v1.c @@ -0,0 +1,79 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + svint64_t lda_vec = svindex_s64(0LL, lda * 2); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec_real = svld1_gather_index(pg, (double *) aoffset1, lda_vec); + svfloat64_t a_vec_imag = svld1_gather_index(pg, ((double *) aoffset1) + 1, lda_vec); + svst2_f64(pg, (double *) boffset, svcreate2(a_vec_real, a_vec_imag)); + aoffset1 += 2; + boffset += active * 2; + } + aoffset += active * lda * 2; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c new file mode 100644 index 000000000..c6e50bc1c --- /dev/null +++ b/kernel/arm64/zgemm_tcopy_sve_v1.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64x2_t a_vec = svld2(pg, (double *)aoffset1); + svst2_f64(pg, (double *) boffset, a_vec); + aoffset1 += lda * 2; + boffset += active * 2; + } + aoffset += active * 2; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c new file mode 100644 index 000000000..37dbfe4e1 --- /dev/null +++ b/kernel/arm64/zhemm_ltcopy_sve.c @@ -0,0 +1,172 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + +#if defined(DOUBLE) + BLASLONG offset, i; + + lda *= 2; + + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint64_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b64(offset, 0LL); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + + int offset, i; + + lda *= 2; + + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t j = 0; + int32_t N = n; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint32_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b32(offset, 0); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active 
&& offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c new file mode 100644 index 000000000..21e03b7be --- /dev/null +++ b/kernel/arm64/zhemm_utcopy_sve.c @@ -0,0 +1,172 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + +#if defined(DOUBLE) + BLASLONG offset, i; + + lda *= 2; + + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, lda); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint64_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + data_vec_imag = svneg_z(pg, data_vec_imag); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b64(offset, 0LL); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); +#else + int offset, i; + + lda *= 2; + + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t j = 0; + int32_t N = n; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, lda); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint32_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + data_vec_imag = svneg_z(pg, data_vec_imag); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b32(offset, 0); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += 
sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/zsymm_lcopy_sve.c b/kernel/arm64/zsymm_lcopy_sve.c new file mode 100644 index 000000000..6f18aa956 --- /dev/null +++ b/kernel/arm64/zsymm_lcopy_sve.c @@ -0,0 +1,150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, offset; + lda *= 2; + +#if defined(DOUBLE) + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint64_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t N = n; + int32_t j = 0; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint32_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/zsymm_ucopy_sve.c b/kernel/arm64/zsymm_ucopy_sve.c new file mode 100644 index 000000000..6be48cdaf --- /dev/null +++ b/kernel/arm64/zsymm_ucopy_sve.c @@ -0,0 +1,150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, offset; + lda *= 2; + +#if defined(DOUBLE) + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, lda_vec); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint64_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = 
svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t N = n; + int32_t j = 0; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, lda_vec); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint32_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S index 462acfe2b..cd053b896 100644 --- a/kernel/arm64/ztrmm_kernel_4x4.S +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alphaR x17 -#define alphaI x18 +#define alphaI x22 #define temp x19 #define tempOffset x20 #define tempK x21 diff --git a/kernel/arm64/ztrmm_kernel_sve_v1x4.S b/kernel/arm64/ztrmm_kernel_sve_v1x4.S new file mode 100644 index 000000000..b71a3d39e --- /dev/null +++ b/kernel/arm64/ztrmm_kernel_sve_v1x4.S @@ -0,0 +1,1006 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR x19 +#define alphaI x20 +#define temp x21 +#define tempOffset x22 +#define tempK x23 + +#define alphaz_R z6.d +#define alphaz_I z7.d +#define alpha0_R d6 +#define alpha0_I d7 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x4_I + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + ld2d {z2.d, z3.d}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, 
p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.d, p1/m, z0.d, z9.d +#else + fmla z17.d, p1/m, z0.d, z9.d +#endif + OP_ii z16.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + + fmla z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.d, p1/m, z0.d, z11.d +#else + fmla z19.d, p1/m, z0.d, z11.d +#endif + ld1rd z11.d, p0/z, [pB, 24] + + + fmla z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.d, p1/m, z0.d, z13.d +#else + fmla z21.d, p1/m, z0.d, z13.d +#endif + OP_ii z20.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + + fmla z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.d, p1/m, z0.d, z15.d +#else + fmla z23.d, p1/m, z0.d, z15.d +#endif + OP_ii z22.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2d {z2.d, z3.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8 + + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pB, 
#B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 64 + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + eor z28.d, z16.d, z16.d + eor z29.d, z16.d, z16.d + fmla z28.d, p1/m, z20.d, alphaz_R + fmls z28.d, p1/m, z21.d, alphaz_I + fmla z29.d, p1/m, z20.d, alphaz_I + fmla z29.d, p1/m, z21.d, alphaz_R + st2d {z28.d, z29.d}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #4 + + eor z30.d, z16.d, z16.d + eor z31.d, z16.d, z16.d + fmla z30.d, p1/m, z22.d, alphaz_R + fmls z30.d, p1/m, z23.d, alphaz_I + fmla z31.d, p1/m, z22.d, alphaz_I + fmla z31.d, p1/m, z23.d, alphaz_R + st2d {z30.d, z31.d}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + 
OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 32 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + dup alphaz_R, alphaR + fmov alphaI, d1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + ptrue p0.d // create true predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lztrmm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lztrmm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +.Lztrmm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lztrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 + cmp counterL , #2 + blt .Lztrmm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lztrmm_kernel_L4_Mv1_22a + + .align 5 +.Lztrmm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L4_Mv1_22 + + .align 5 +.Lztrmm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lztrmm_kernel_L4_Mv1_44 + + .align 5 +.Lztrmm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lztrmm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lztrmm_kernel_L4_Mv1_44 + + +.Lztrmm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lztrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Lztrmm_kernel_L4_Mv1_100 + + .align 5 +.Lztrmm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lztrmm_kernel_L4_Mv1_46 + +.Lztrmm_kernel_L4_Mv1_100: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lztrmm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Lztrmm_kernel_L4_Mv1_20 + + + +.Lztrmm_kernel_L4_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lztrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lztrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lztrmm_kernel_L999 + + tst counterJ , #2 + ble .Lztrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + + + +.Lztrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lztrmm_kernel_L2_Mv1_20: + + 
INITv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lztrmm_kernel_L2_Mv1_40 + .align 5 + +.Lztrmm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L2_Mv1_22 + + +.Lztrmm_kernel_L2_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lztrmm_kernel_L2_Mv1_100 + +.Lztrmm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L2_Mv1_42 + +.Lztrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lztrmm_kernel_L2_Mv1_END: + + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lztrmm_kernel_L2_Mv1_20 + + +.Lztrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +.Lztrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lztrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +.Lztrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lztrmm_kernel_L1_Mv1_20: + + INITv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lztrmm_kernel_L1_Mv1_40 + .align 5 + +.Lztrmm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L1_Mv1_22 + + +.Lztrmm_kernel_L1_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lztrmm_kernel_L1_Mv1_100 + +.Lztrmm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L1_Mv1_42 + +.Lztrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if 
defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lztrmm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lztrmm_kernel_L1_Mv1_20 + +.Lztrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lztrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c new file mode 100644 index 000000000..d34f607ab --- /dev/null +++ b/kernel/arm64/ztrmm_lncopy_sve_v1.c @@ -0,0 +1,145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY * 2 + posX * lda; + } else { + ao = a + posX * 2 + posY * lda; + } + + i = 0; + do + { + if (X > posY) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X < posY) { + ao += lda; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#endif + ao += n_active * 2; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c new file mode 100644 index 000000000..7f34c9857 --- /dev/null +++ b/kernel/arm64/ztrmm_ltcopy_sve_v1.c @@ -0,0 +1,143 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + FLOAT *ao; + js = 0; +#ifdef DOUBLE + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY * 2 + posX * lda; + } else { + ao = a + posX * 2 + posY * lda; + } + + i = 0; + do + { + if (X > posY) { + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X < posY) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + ao += lda; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); + } + } +#endif + ao += n_active * lda; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + + return 0; +} diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c new file mode 100644 index 000000000..7eb9452c9 --- /dev/null +++ b/kernel/arm64/ztrmm_uncopy_sve_v1.c @@ -0,0 +1,145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX * 2 + posY * lda; + } else { + ao = a + posY * 2 + posX * lda; + } + + i = 0; + do + { + if (X < posY) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X > posY) { + ao += lda; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. 
*/ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); + } + } +#endif + ao += n_active * 2; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c new file mode 100644 index 000000000..60c8ff3b4 --- /dev/null +++ b/kernel/arm64/ztrmm_utcopy_sve_v1.c @@ -0,0 +1,141 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + FLOAT *ao; + js = 0; +#ifdef DOUBLE + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX * 2 + posY * lda; + } else { + ao = a + posY * 2 + posX * lda; + } + + i = 0; + do + { + if (X < posY) { + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X > posY) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + ao += lda; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#endif + ao += n_active * lda; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/ztrsm_lncopy_sve.c b/kernel/arm64/ztrsm_lncopy_sve.c new file mode 100644 index 000000000..eb7cd0294 --- /dev/null +++ b/kernel/arm64/ztrsm_lncopy_sve.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); + *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); + } + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + } + ao += n_active * 2; + b += n_active * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + } + ao += 2; + b += n_active * 2; + i++; + ii++; + } + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/ztrsm_ltcopy_sve.c b/kernel/arm64/ztrsm_ltcopy_sve.c new file mode 100644 index 000000000..34dbf8a30 --- /dev/null +++ b/kernel/arm64/ztrsm_ltcopy_sve.c @@ -0,0 +1,115 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); + *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); + } + } + b += n_active * n_active * 2; + ao += lda * n_active; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + } + ao += lda; + b += n_active * 2; + i ++; + ii ++; + } + } while (i < m); + + + a += n_active * 2; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/ztrsm_uncopy_sve.c b/kernel/arm64/ztrsm_uncopy_sve.c new file mode 100644 index 000000000..92e086b75 --- /dev/null +++ b/kernel/arm64/ztrsm_uncopy_sve.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); + *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); + } + } + ao += n_active * 2; + b += n_active * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + } + ao += 2; + b += n_active * 2; + i++; + ii++; + } + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/ztrsm_utcopy_sve.c b/kernel/arm64/ztrsm_utcopy_sve.c new file mode 100644 index 
000000000..ccb942e1b --- /dev/null +++ b/kernel/arm64/ztrsm_utcopy_sve.c @@ -0,0 +1,115 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); + *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); + } + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + } + ao += lda * n_active; + b += n_active * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + } + ao += lda; + b += n_active * 2; + i ++; + ii ++; + } + } while (i < m); + + + a += n_active * 2; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/e2k/KERNEL b/kernel/e2k/KERNEL new file mode 100644 index 000000000..afa8a0881 --- /dev/null +++ b/kernel/e2k/KERNEL @@ -0,0 +1,149 @@ +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +DSDOTKERNEL = ../generic/dot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = 
../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c +LSAME_KERNEL = ../generic/lsame.c + +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + + diff --git a/kernel/e2k/Makefile b/kernel/e2k/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/e2k/Makefile @@ -0,0 +1 @@ +clean :: diff --git a/kernel/generic/dot.c b/kernel/generic/dot.c index 5abbb735c..84568ee0b 100644 --- a/kernel/generic/dot.c +++ b/kernel/generic/dot.c @@ -47,7 +47,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -4; #if V_SIMD && !defined(DSDOT) const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); @@ -84,6 +83,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } dot = v_sum_f32(vsum0); #elif defined(DSDOT) + int n1 = n & -4; for (; i < n1; i += 4) { dot += (double) y[i] * (double) x[i] @@ -92,6 +92,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) + (double) y[i+3] * (double) x[i+3] ; } #else + int n1 = n & -4; for (; i < n1; i += 4) { dot += y[i] * x[i] diff --git a/kernel/generic/gemm_small_matrix_kernel_nn.c b/kernel/generic/gemm_small_matrix_kernel_nn.c new file 
mode 100644
index 000000000..543e7e047
--- /dev/null
+++ b/kernel/generic/gemm_small_matrix_kernel_nn.c
@@ -0,0 +1,56 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#ifdef B0
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
+#else
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
+#endif
+{
+ //naive implementation
+ //Column major
+
+ BLASLONG i,j,k;
+ FLOAT result=0.0;
+
+ for(i=0; i<M; i++){
+  for(j=0; j<N; j++){
+   result=0.0;
+   for(k=0; k<K; k++){
+    result += A[i+k*lda] * B[k+j*ldb];
+   }
+#ifdef B0
+   C[i+j*ldc] = alpha * result;
+#else
+   C[i+j*ldc] = C[i+j*ldc]*beta + alpha * result;
+#endif
+  }
+ }
+
+ return 0;
+}
+ /* if (!(N >> 2)) goto L_N3 */
+ srai.d J, N, 2 /* J = bn >> 2 */
+ andi N, N, 0x03
+ beq ZERO, J, .L_N3
+
+.L_J1: /* J-- && This loop includes Condition 1 */
+
+/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!!
************************* +* dgemm_core_16x4 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M8 + +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D15, + * avoidig set 0 operation + * Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + preld 0, C0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + preld 0, C0, 0x40 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U4, B0, 0x08 + preld 0, C1, 0x00 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + preld 0, C1, 0x40 + xvfmul.d D6, U2, U4 + xvfmul.d D7, U3, U4 + + xvldrepl.d U4, B0, 0x10 + preld 0, C2, 0x00 + /* line 3 */ + xvfmul.d D8, U0, U4 + xvfmul.d D9, U1, U4 + preld 0, C2, 0x40 + xvfmul.d D10, U2, U4 + xvfmul.d D11, U3, U4 + + xvldrepl.d U4, B0, 0x18 + preld 0, C3, 0x00 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + preld 0, C3, 0x40 + xvfmul.d D14, U2, U4 + xvfmul.d D15, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + /* Calculate 8 sets of D0~D15 */ +.L_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, 
D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-7***/ + /* Load 16 * 64 from 
A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_TL1 + + /* Maybe we need calculate the last + * 7 sets of D0~D15? + */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_L0 + +.L_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_L71 + +.L_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D10, D10, VALPHA + xvfmul.d D11, D11, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA + xvfmul.d D14, D14, VALPHA + xvfmul.d D15, D15, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvld U2, C1, 0x40 + xvld U3, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + xvfmadd.d D6, D6, VALPHA, U2 + xvfmadd.d D7, 
D7, VALPHA, U3 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvld U2, C2, 0x40 + xvld U3, C2, 0x60 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + xvfmadd.d D10, D10, VALPHA, U2 + xvfmadd.d D11, D11, VALPHA, U3 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvld U2, C3, 0x40 + xvld U3, C3, 0x60 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 + xvfmadd.d D14, D14, VALPHA, U2 + xvfmadd.d D15, D15, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + xvst D10, C2, 0x40 + xvst D11, C2, 0x60 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + xvst D14, C3, 0x40 + xvst D15, C3, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + addi.d C2, C2, 0x80 + addi.d C3, C3, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -16 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 + +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_M0 + + andi I, M, 8 + beq ZERO,I, .L_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif // #if defined(TRMMKERNEL) + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + xvfmul.d D9, U1, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_M8_L7 + +.L_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + 
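
The same K-loop schedule repeats for every tile size in this kernel: one peeled step that initialises the accumulators with xvfmul (the "avoiding set 0 operation" comment), an 8-way unrolled body driven by TL = (L-1) >> 3 (the ***8-1*** .. ***8-8*** blocks), and a remainder loop driven by L & 7. The following is a minimal scalar sketch of that schedule only, assuming K >= 1; the names (ktile_sketch, MR, NR, acc) are illustrative and do not appear in the patch.

static void ktile_sketch(int K, int MR, int NR,
                         const double *A, const double *B, double *acc)
{
    /* Peeled first K-step: acc = A(:,0) * B(0,:), so no zero-fill is needed. */
    for (int i = 0; i < MR; i++)
        for (int j = 0; j < NR; j++)
            acc[i * NR + j] = A[i] * B[j];
    A += MR; B += NR;

    int L = K - 1;
    for (int t = 0; t < (L >> 3); t++)        /* 8-way unrolled body */
        for (int u = 0; u < 8; u++) {
            for (int i = 0; i < MR; i++)
                for (int j = 0; j < NR; j++)
                    acc[i * NR + j] += A[i] * B[j];
            A += MR; B += NR;
        }

    for (int r = 0; r < (L & 7); r++) {       /* remainder loop, .L_*_L71 style */
        for (int i = 0; i < MR; i++)
            for (int j = 0; j < NR; j++)
                acc[i * NR + j] += A[i] * B[j];
        A += MR; B += NR;
    }
}
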
xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M8_TL1 + +.L_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M8_L0 + +.L_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M8_L71 + +.L_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, 
VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -8 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 8)) End************/ + +.L_M4: + andi I, M, 4 + beq ZERO,I, .L_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 + +.L_M4_TL1: /* TL-- */ + /***8-1***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, 
B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M4_TL1 + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 + +.L_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 + +.L_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + /* Store C2 */ + xvst D8, C2, 0x00 + /* Store C3 */ + xvst D12, C3, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ + +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 
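
Each tile repeats the same TRMMKERNEL pointer and trip-count setup; the slli.d shift constants are log2(elements * sizeof(double)), e.g. 0x07 for a 16-double A micro-panel, 0x05 for a 4-double B row, 0x04 for 2 doubles, 0x03 for 1. A hedged C sketch of that bookkeeping follows; trmm_setup_sketch, a0, b0, mr, nr and off are hypothetical names standing in for the registers, and the arithmetic is in elements rather than bytes.

static void trmm_setup_sketch(int left, int transa,
                              long k, long off, long mr, long nr,
                              const double **a0, const double *b,
                              const double **b0, long *l)
{
    if ((left && transa) || (!left && !transa)) {
        *b0 = b;                      /* consume B from its start */
    } else {
        *a0 += off * mr;              /* skip off packed A micro-panels */
        *b0 = b + off * nr;           /* skip off packed B micro-panels */
    }

    if ((left && !transa) || (!left && transa))
        *l = k - off;                 /* trailing part of K */
    else
        *l = off + (left ? mr : nr);  /* leading part of K */
}
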
+#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + +.L_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 + +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 + +.L_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, 
U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 + +.L_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + xvstelm.d D4, C1, 0x08, 0x01 + xvstelm.d D8, C2, 0x08, 0x01 + xvstelm.d D12, C3, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ + +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 + +.L_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + 
xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 + +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 + +.L_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M1_L71 + +.L_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + addi.d C2, C2, 0x08 + addi.d C3, C3, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -1 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 1) ) End************/ + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + 
/* since the array type is double, + * so we must mul 32 + */ + slli.d T0, K, 5 + slli.d T1, LDC, 5 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x04 +#endif + + blt ZERO, J, .L_J1 + +//////////////// go back to L_J1 ///////////////// +///////////////////////////////////////////////// +/************************ Condition 1 if((N >> 2) && (M >> 4)) END !!! ************************/ + +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +/************************* Condition 2 if((N & 2) && (M >> 4)) START !!! ************************* +* dgemm_core_16x2 */ + + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N3_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N3_M8 + +.L_N3_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + xvfmul.d D6, U2, U4 + xvfmul.d D7, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 + +.L_N3_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 
0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-7***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_TL1 + +.L_N3_L7: + /* if (!(L & 7)) goto L_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_L0 + +.L_N3_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_L71 + +.L_N3_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvld U2, C1, 0x40 + xvld U3, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + xvfmadd.d D6, D6, VALPHA, U2 + xvfmadd.d D7, D7, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 
0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -16 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N3_I1 + +.L_N3_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N3_M0 + + andi I, M, 8 + beq ZERO,I, .L_N3_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M8_L7 */ + beq ZERO,TL, .L_N3_M8_L7 + +.L_N3_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + 
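
Every tile ends with the same epilogue: under TRMMKERNEL the accumulators are only scaled by alpha (xvfmul.d), otherwise the existing C values are loaded first and folded in with fused multiply-adds (xvfmadd.d D, D, VALPHA, U computes D*alpha + C). A small scalar sketch of that step, assuming acc[] holds the finished A*B partial sums for an mr-by-nr tile; the function and parameter names are illustrative only.

static void tile_store_sketch(int trmm, long mr, long nr, double alpha,
                              const double *acc, double *c, long ldc)
{
    for (long j = 0; j < nr; j++)
        for (long i = 0; i < mr; i++) {
            double v = alpha * acc[i * nr + j];
            c[i + j * ldc] = trmm ? v : c[i + j * ldc] + v;
        }
}
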
xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M8_TL1 + +.L_N3_M8_L7: + /* if (!(L & 7)) goto L_N3_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M8_L0 + +.L_N3_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M8_L71 + +.L_N3_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2) && (M & 8) ) End************/ + +.L_N3_M4: + andi I, M, 4 + beq ZERO,I, .L_N3_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 + +.L_N3_M4_TL1: /* TL-- */ + /***8-1***/ + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + 
xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M4_TL1 + +.L_N3_M4_L7: + /* if (!(L & 7)) goto L_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M4_L0 + +.L_N3_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M4_L71 + +.L_N3_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 4) ) End************/ + +.L_N3_M2: + andi I, M, 2 + beq ZERO,I, .L_N3_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M2_L7 */ + beq ZERO,TL, .L_N3_M2_L7 + +.L_N3_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 
0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M2_TL1 + +.L_N3_M2_L7: + /* if (!(L & 7)) goto L_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M2_L0 + +.L_N3_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M2_L71 + +.L_N3_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + xvstelm.d D4, C1, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 2) ) End************/ + +.L_N3_M1: + andi I, M, 1 + beq ZERO,I, .L_N3_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + 
srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 + +.L_N3_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M1_TL1 + +.L_N3_M1_L7: + /* if (!(L & 7)) goto L_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M1_L0 + +.L_N3_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M1_L71 + +.L_N3_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 1) ) End************/ + +.L_N3_M0: + /* Add stride for B and C + * B += (K * 16) + * C += (LDC * 16) + */ + /* since the array type is double, + * so we must mul 16 + */ + slli.d T0, K, 4 + slli.d T1, LDC, 4 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x02 +#endif + + /* We must reinit I */ + srai.d I, M, 4 /* I = bm >> 4 */ + +/************************* Condition 2 if((N & 2) && (M >> 4)) End !!! 
************************* +* dgemm_core_16x2 */ + +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + +/************************* Condition 3 if((N & 1) && (M >> 4)) START !!! ************************* +* dgemm_core_16x1 */ + + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N1_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N1_M8 + +.L_N1_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_L7 */ + beq ZERO,TL, .L_N1_L7 + +.L_N1_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-7***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d 
D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_TL1 + +.L_N1_L7: + /* if (!(L & 7)) goto L_N1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_L0 + +.L_N1_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_L71 + +.L_N1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -16 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_I1 + +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N1_M0 + + andi I, M, 8 + beq ZERO,I, .L_N1_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M8_L7 */ + beq ZERO,TL, .L_N1_M8_L7 + +.L_N1_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, 
D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M8_TL1 + +.L_N1_M8_L7: + /* if (!(L & 7)) goto L_N1_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M8_L0 + +.L_N1_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M8_L71 + +.L_N1_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1) && (M & 8) ) End************/ + +.L_N1_M4: + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 + +.L_N1_M4_TL1: /* TL-- */ + /***8-1***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 
0x20 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 + +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 + +.L_N1_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 + +.L_N1_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1) && (M & 4) ) End************/ + +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 + +.L_N1_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + 
addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 + +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 + +.L_N1_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 + +.L_N1_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 2) ) End************/ + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + +.L_N1_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 + +.L_N1_M1_L7: + /* 
if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 + +.L_N1_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 1) ) End************/ + +.L_N1_M0: + +/************************* Condition 3 if((N & 1) && (M >> 4)) End !!! ************************* +* dgemm_core_16x1 */ + +.L_N0: + /* Restore regs */ + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LD $f23, $sp, 40 + addi.d $sp, $sp, 56 + + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_16.S b/kernel/loongarch64/dgemm_ncopy_16.S new file mode 100644 index 000000000..95c879031 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_16.S @@ -0,0 +1,691 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
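Note on the remainder handling just concluded: the .L_N1_* blocks cover the last column of B when N is odd, peeling M into tiles of 16, 8, 4, 2 and 1 rows of the packed A panel. The following is only a rough C model of what one mr-wide tile computes; the function name and parameters are illustrative, not the kernel's real interface, and the TRMM offset bookkeeping (OFF/OFFSET) is reduced to a flag.

    #include <stddef.h>

    /* Illustrative model of one mr-wide tile of the N&1 tail above
     * (mr = 16, 8, 4, 2 or 1).  pa holds the packed A panel with mr
     * values per k step, pb the packed B column with one value per k. */
    static void dgemm_n1_tile(ptrdiff_t mr, ptrdiff_t k, double alpha,
                              const double *pa, const double *pb,
                              double *c, int trmmkernel)
    {
        for (ptrdiff_t i = 0; i < mr; i++) {
            double acc = 0.0;
            for (ptrdiff_t l = 0; l < k; l++)
                acc += pa[l * mr + i] * pb[l];   /* the xvfmadd.d accumulation  */
            if (trmmkernel)
                c[i] = alpha * acc;              /* TRMMKERNEL: C is overwritten */
            else
                c[i] += alpha * acc;             /* default path: C += alpha*A*B */
        }
    }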
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define S11 $r24 +#define S12 $r25 +#define S13 $r26 +#define S14 $r27 +#define S15 $r28 +#define S16 $r29 +#define TD $r30 +#define TS $r31 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 + + PROLOGUE + + addi.d $sp, $sp, -0x90 + SDARG $r23, $sp, 0x00 + SDARG $r24, $sp, 0x08 + SDARG $r25, $sp, 0x10 + SDARG $r26, $sp, 0x18 + SDARG $r27, $sp, 0x20 + SDARG $r28, $sp, 0x28 + SDARG $r29, $sp, 0x30 + SDARG $r30, $sp, 0x38 + SDARG $r31, $sp, 0x40 + ST $f23, $sp, 0x48 + ST $f24, $sp, 0x50 + ST $f25, $sp, 0x58 + ST $f26, $sp, 0x60 + ST $f27, $sp, 0x68 + ST $f28, $sp, 0x70 + ST $f29, $sp, 0x78 + ST $f30, $sp, 0x80 + ST $f31, $sp, 0x88 + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x04 + beq J, ZERO, .L_N8 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + addi.d J, J, -1 + add.d S4, S3, TL + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S9, S7, T0 + add.d S10, S8, T0 + add.d S11, S9, T0 + add.d S12, S10, T0 + add.d S13, S11, T0 + add.d S14, S12, T0 + add.d S15, S13, T0 + add.d S16, S14, T0 + add.d TS, S15, T0 + beq I, ZERO, .L_I7 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + xvld U8, S9, 0x00 + xvld U9, S10, 0x00 + xvld U10, S11, 0x00 + xvld U11, S12, 0x00 + xvld U12, S13, 0x00 + xvld U13, S14, 0x00 + xvld U14, S15, 0x00 + xvld U15, S16, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvpackev.d D8, U9, U8 + xvpackod.d D9, U9, U8 + xvpackev.d D10, U11, U10 + xvpackod.d D11, U11, U10 + xvpackev.d D12, U13, U12 + xvpackod.d D13, U13, U12 + xvpackev.d D14, U15, U14 + xvpackod.d D15, U15, U14 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 4 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 5 + xvpermi.q D2, U0, 0x31 // 8 + xvpermi.q D6, U4, 0x31 // 9 + xvpermi.q D3, U1, 0x31 // 12 + xvpermi.q D7, U5, 0x31 // 13 + + xvand.v U8, D8, D8 + xvpermi.q D8, D10, 0x02 // 2 + 
xvand.v U12, D12, D12 + xvpermi.q D12, D14, 0x02 // 3 + xvand.v U9, D9, D9 + xvpermi.q D9, D11, 0x02 // 6 + xvand.v U13, D13, D13 + xvpermi.q D13, D15, 0x02 // 7 + xvpermi.q D10, U8, 0x31 // 10 + xvpermi.q D14, U12, 0x31 // 11 + xvpermi.q D11, U9, 0x31 // 14 + xvpermi.q D15, U13, 0x31 // 15 + + xvst D0, TD, 0x00 // 0 + xvst D4, TD, 0x20 // 1 + xvst D8, TD, 0x40 // 2 + xvst D12, TD, 0x60 // 3 + xvst D1, TD, 0x80 // 4 + xvst D5, TD, 0xA0 // 5 + xvst D9, TD, 0xC0 // 6 + xvst D13, TD, 0xE0 // 7 + addi.d TD, TD, 0x100 + xvst D2, TD, 0x00 // 8 + xvst D6, TD, 0x20 // 9 + xvst D10, TD, 0x40 // 10 + xvst D14, TD, 0x60 // 11 + xvst D3, TD, 0x80 // 12 + xvst D7, TD, 0xA0 // 13 + xvst D11, TD, 0xC0 // 14 + xvst D15, TD, 0xE0 // 15 + addi.d TD, TD, 0x100 + + xvld U0, S1, 0x20 + xvld U1, S2, 0x20 + xvld U2, S3, 0x20 + xvld U3, S4, 0x20 + xvld U4, S5, 0x20 + xvld U5, S6, 0x20 + xvld U6, S7, 0x20 + xvld U7, S8, 0x20 + xvld U8, S9, 0x20 + xvld U9, S10, 0x20 + xvld U10, S11, 0x20 + xvld U11, S12, 0x20 + xvld U12, S13, 0x20 + xvld U13, S14, 0x20 + xvld U14, S15, 0x20 + xvld U15, S16, 0x20 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvpackev.d D8, U9, U8 + xvpackod.d D9, U9, U8 + xvpackev.d D10, U11, U10 + xvpackod.d D11, U11, U10 + xvpackev.d D12, U13, U12 + xvpackod.d D13, U13, U12 + xvpackev.d D14, U15, U14 + xvpackod.d D15, U15, U14 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 4 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 5 + xvpermi.q D2, U0, 0x31 // 8 + xvpermi.q D6, U4, 0x31 // 9 + xvpermi.q D3, U1, 0x31 // 12 + xvpermi.q D7, U5, 0x31 // 13 + + xvand.v U8, D8, D8 + xvpermi.q D8, D10, 0x02 // 2 + xvand.v U12, D12, D12 + xvpermi.q D12, D14, 0x02 // 3 + xvand.v U9, D9, D9 + xvpermi.q D9, D11, 0x02 // 6 + xvand.v U13, D13, D13 + xvpermi.q D13, D15, 0x02 // 7 + xvpermi.q D10, U8, 0x31 // 10 + xvpermi.q D14, U12, 0x31 // 11 + xvpermi.q D11, U9, 0x31 // 14 + xvpermi.q D15, U13, 0x31 // 15 + + xvst D0, TD, 0x00 // 0 + xvst D4, TD, 0x20 // 1 + xvst D8, TD, 0x40 // 2 + xvst D12, TD, 0x60 // 3 + xvst D1, TD, 0x80 // 4 + xvst D5, TD, 0xA0 // 5 + xvst D9, TD, 0xC0 // 6 + xvst D13, TD, 0xE0 // 7 + addi.d TD, TD, 0x100 + xvst D2, TD, 0x00 // 8 + xvst D6, TD, 0x20 // 9 + xvst D10, TD, 0x40 // 10 + xvst D14, TD, 0x60 // 11 + xvst D3, TD, 0x80 // 12 + xvst D7, TD, 0xA0 // 13 + xvst D11, TD, 0xC0 // 14 + xvst D15, TD, 0xE0 // 15 + addi.d TD, TD, 0x100 + + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d S9, S9, 0x40 + addi.d S10, S10, 0x40 + addi.d S11, S11, 0x40 + addi.d S12, S12, 0x40 + addi.d S13, S13, 0x40 + addi.d S14, S14, 0x40 + addi.d S15, S15, 0x40 + addi.d S16, S16, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I7: + andi I, M, 0x07 + beq I, ZERO, .L_I0 + +.L_II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 
0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + fld.d F0, S9, 0x00 + fld.d F1, S10, 0x00 + fld.d F2, S11, 0x00 + fld.d F3, S12, 0x00 + fld.d F4, S13, 0x00 + fld.d F5, S14, 0x00 + fld.d F6, S15, 0x00 + fld.d F7, S16, 0x00 + + fst.d F0, TD, 0x00 + addi.d S9, S9, 0x08 + fst.d F1, TD, 0x08 + addi.d S10, S10, 0x08 + fst.d F2, TD, 0x10 + addi.d S11, S11, 0x08 + fst.d F3, TD, 0x18 + addi.d S12, S12, 0x08 + fst.d F4, TD, 0x20 + addi.d S13, S13, 0x08 + fst.d F5, TD, 0x28 + addi.d S14, S14, 0x08 + fst.d F6, TD, 0x30 + addi.d S15, S15, 0x08 + fst.d F7, TD, 0x38 + addi.d S16, S16, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_II1 + +.L_I0: + blt ZERO, J, .L_J1 + +.L_N8: + andi J, N, 0x08 + beq ZERO, J, .L_N4 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d TS, S7, T0 + beq I, ZERO, .L_8I3 + +.L_8I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 2 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 3 + xvpermi.q D2, U0, 0x31 // 4 + xvpermi.q D6, U4, 0x31 // 5 + xvpermi.q D3, U1, 0x31 // 6 + xvpermi.q D7, U5, 0x31 // 7 + + xvst D0, TD, 0x00 + xvst D4, TD, 0x20 + xvst D1, TD, 0x40 + xvst D5, TD, 0x60 + xvst D2, TD, 0x80 + xvst D6, TD, 0xA0 + xvst D3, TD, 0xC0 + xvst D7, TD, 0xE0 + addi.d TD, TD, 0x100 + + xvld U0, S1, 0x20 + xvld U1, S2, 0x20 + xvld U2, S3, 0x20 + xvld U3, S4, 0x20 + xvld U4, S5, 0x20 + xvld U5, S6, 0x20 + xvld U6, S7, 0x20 + xvld U7, S8, 0x20 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 2 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 3 + xvpermi.q D2, U0, 0x31 // 4 + xvpermi.q D6, U4, 0x31 // 5 + xvpermi.q D3, U1, 0x31 // 6 + xvpermi.q D7, U5, 0x31 // 7 + + xvst D0, TD, 0x00 + xvst D4, TD, 0x20 + xvst D1, TD, 0x40 + xvst D5, TD, 0x60 + xvst D2, TD, 0x80 + xvst D6, TD, 0xA0 + xvst D3, TD, 0xC0 + xvst D7, TD, 0xE0 + addi.d TD, TD, 0x100 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_8I1 + +.L_8I3: + andi I, M, 0x07 + beq I, ZERO, .L_N4 + +.L_8I11: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + + addi.d TD, TD, 
0x40 + addi.d I, I, -1 + blt ZERO, I, .L_8I11 + +.L_N4: + andi J, N, 0x04 + beq ZERO, J, .L_N2 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + beq I, ZERO, .L_I3 + +.L_4I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 1 + xvpermi.q D2, U0, 0x31 // 2 + xvpermi.q D3, U1, 0x31 // 3 + + xvst D0, TD, 0x00 + xvst D1, TD, 0x20 + xvst D2, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_4I1 + +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_N2 + +.L_4II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_4II1 + +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_NI1 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + + xvpermi.q D0, D1, 0x02 // 0 + + xvst D0, TD, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 + +.L_NI1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 + +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 + +.L_N0: + LDARG $r23, $sp, 0x00 + LDARG $r24, $sp, 0x08 + LDARG $r25, $sp, 0x10 + LDARG $r26, $sp, 0x18 + LDARG $r27, $sp, 0x20 + LDARG $r28, $sp, 0x28 + LDARG $r29, $sp, 0x30 + LDARG $r30, $sp, 0x38 + LDARG $r31, $sp, 0x40 + LD $f23, $sp, 0x48 + LD $f24, $sp, 0x50 + LD $f25, $sp, 0x58 + LD $f26, $sp, 0x60 + LD $f27, $sp, 0x68 + LD $f28, $sp, 0x70 + LD $f29, $sp, 0x78 + LD $f30, $sp, 0x80 + LD $f31, $sp, 0x88 + addi.d $sp, $sp, 0x90 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_4.S b/kernel/loongarch64/dgemm_ncopy_4.S new file mode 100644 index 000000000..b1f322a06 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_4.S @@ -0,0 +1,237 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
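The dgemm_ncopy_16.S routine above packs panels of up to 16 columns of a column-major matrix so that, for every row, the panel's column entries become contiguous in the destination buffer; the 8/4/2/1-column tails reuse the same layout. Below is a rough scalar equivalent of the data movement only (names are illustrative, and no claim is made about the exact OpenBLAS prototype or the register-level schedule).

    #include <stddef.h>

    /* Scalar model of the packing done by dgemm_ncopy_16.S above.
     * a is column-major with leading dimension lda; b receives the
     * packed panels: width-16 panels first, then the 8/4/2/1 tails. */
    static void dgemm_ncopy_16_ref(ptrdiff_t m, ptrdiff_t n,
                                   const double *a, ptrdiff_t lda, double *b)
    {
        ptrdiff_t j = 0;
        for (ptrdiff_t w = 16; w > 0; w >>= 1)       /* panel widths 16,8,4,2,1 */
            for (; j + w <= n; j += w)
                for (ptrdiff_t i = 0; i < m; i++)
                    for (ptrdiff_t jj = 0; jj < w; jj++)
                        *b++ = a[(j + jj) * lda + i];
    }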
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr14 +#define D1 $xr8 +#define D2 $xr9 +#define D3 $xr10 +#define D4 $xr11 +#define D5 $xr12 +#define D6 $xr13 +#define D7 $xr15 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x02 + beq J, ZERO, .L_N2 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + addi.d J, J, -1 + beq I, ZERO, .L_I3 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 1 + xvpermi.q D2, U0, 0x31 // 2 + xvpermi.q D3, U1, 0x31 // 3 + + xvst D0, TD, 0x00 + xvst D1, TD, 0x20 + xvst D2, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_I0 + +.L_II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_II1 + +.L_I0: + blt ZERO, J, .L_J1 + +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d TS, S2, TL + beq I, ZERO, .L_2I3 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + + xvand.v U0, D0, D0 + xvpermi.q D0, D1, 0x02 // 0 + xvpermi.q D1, U0, 0x31 // 1 + + xvst D0, TD, 0x00 + xvst D1, TD, 0x20 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d TD, TD, 
0x40 + addi.d I, I, -1 + blt ZERO, I, .L_2I1 + +.L_2I3: + andi I, M, 0x03 + beq ZERO, I, .L_N1 + +.L_2II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + fst.d F1, TD, 0x08 + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + blt ZERO, I, .L_2II1 + +.L_N1: + andi J, N, 0x01 + beq ZERO, J, .L_N0 + + move S1, TS + srai.d I, M, 0x02 + beq ZERO, I, .L_1I3 + +.L_1I1: + xvld U0, S1, 0x00 + addi.d S1, S1, 0x20 + xvst U0, TD, 0x00 + addi.d I, I, -1 + addi.d TD, TD, 0x20 + blt ZERO, I, .L_1I1 + +.L_1I3: + andi I, M, 0x03 + beq ZERO, I, .L_N0 + +.L_1II1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + addi.d TD, TD, 0x08 + blt ZERO, I, .L_1II1 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_16.S b/kernel/loongarch64/dgemm_tcopy_16.S new file mode 100644 index 000000000..afafe5b37 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_16.S @@ -0,0 +1,710 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
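dgemm_ncopy_4.S above is the narrow variant of the same packing, with panel widths 4/2/1 instead of 16 down to 1; in terms of the sketch given earlier it amounts to starting the width loop at 4. Again this models only the output layout, not the vectorized schedule.

    /* Same layout as dgemm_ncopy_16_ref, restricted to widths 4, 2, 1. */
    static void dgemm_ncopy_4_ref(ptrdiff_t m, ptrdiff_t n,
                                  const double *a, ptrdiff_t lda, double *b)
    {
        ptrdiff_t j = 0;
        for (ptrdiff_t w = 4; w > 0; w >>= 1)
            for (; j + w <= n; j += w)
                for (ptrdiff_t i = 0; i < m; i++)
                    for (ptrdiff_t jj = 0; jj < w; jj++)
                        *b++ = a[(j + jj) * lda + i];
    }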
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + + PROLOGUE + + addi.d $sp, $sp, -56 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x04 + srai.d T1, N, 0x03 + slli.d T0, T0, 0x04 + slli.d T1, T1, 0x03 + mul.d P2, M, T0 + mul.d P3, M, T1 + slli.d P2, P2, 0x03 + slli.d P3, P3, 0x03 + add.d P2, DST, P2 + add.d P3, DST, P3 + + srai.d T0, N, 0x02 + srai.d T1, N, 0x01 + slli.d T0, T0, 0x02 + slli.d T1, T1, 0x01 + mul.d P4, M, T0 + mul.d P5, M, T1 + slli.d P4, P4, 0x03 + slli.d P5, P5, 0x03 + add.d P4, DST, P4 + add.d P5, DST, P5 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x03 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x07 + beq ZERO, J, .L_M7 + +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S0, S7, T0 + + move P1, P0 + addi.d P0, P0, 0x400 + + srai.d I, N, 0x04 + addi.d J, J, -1 + beq ZERO, I, .L_N15 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S5, 0x40 + xvld U3, S5, 0x60 + xvld U4, S6, 0x00 + xvld U5, S6, 0x20 + xvld U6, S6, 0x40 + xvld U7, S6, 0x60 + + xvst U0, P1, 0x200 + xvst U1, P1, 0x220 + xvst U2, P1, 0x240 + xvst U3, P1, 0x260 + xvst U4, P1, 0x280 + xvst U5, P1, 0x2A0 + xvst U6, P1, 0x2C0 + xvst U7, P1, 0x2E0 + + xvld U0, S7, 0x00 + xvld U1, S7, 0x20 + xvld U2, S7, 0x40 + xvld U3, S7, 0x60 + xvld U4, S8, 0x00 + xvld U5, S8, 0x20 + xvld U6, S8, 0x40 + xvld U7, S8, 0x60 + + xvst U0, P1, 0x300 + xvst U1, P1, 0x320 + xvst U2, P1, 0x340 + xvst U3, P1, 0x360 + xvst U4, P1, 0x380 + xvst U5, P1, 0x3A0 + xvst U6, P1, 0x3C0 + xvst U7, P1, 0x3E0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d S5, S5, 0x80 + addi.d S6, S6, 0x80 + addi.d S7, S7, 0x80 + addi.d S8, S8, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, 
.L_I1 + +.L_N15: + andi I, N, 0x08 + beq ZERO, I, .L_N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S6, 0x00 + xvld U3, S6, 0x20 + xvld U4, S7, 0x00 + xvld U5, S7, 0x20 + xvld U6, S8, 0x00 + xvld U7, S8, 0x20 + + xvst U0, P2, 0x100 + xvst U1, P2, 0x120 + xvst U2, P2, 0x140 + xvst U3, P2, 0x160 + xvst U4, P2, 0x180 + xvst U5, P2, 0x1A0 + xvst U6, P2, 0x1C0 + xvst U7, P2, 0x1E0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d P2, P2, 0x200 + +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + xvst U4, P3, 0x80 + xvst U5, P3, 0xA0 + xvst U6, P3, 0xC0 + xvst U7, P3, 0xE0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P3, P3, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + xvpermi.q U4, U5, 0x02 + xvpermi.q U6, U7, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + xvst U4, P4, 0x40 + xvst U6, P4, 0x60 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P4, P4, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + fst.d F2, P5, 0x10 + fst.d F3, P5, 0x18 + fst.d F4, P5, 0x20 + fst.d F5, P5, 0x28 + fst.d F6, P5, 0x30 + fst.d F7, P5, 0x38 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P5, P5, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x200 + + srai.d I, N, 0x04 + beq ZERO, I, .L_4N15 + +.L_4I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 
0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 + +.L_4N15: + andi I, N, 0x08 + beq ZERO, I, .L_4N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d P2, P2, 0x100 + +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P3, P3, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P4, P4, 0x40 + +.L_4N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + fst.d F2, P5, 0x10 + fst.d F3, P5, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P5, P5, 0x20 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x100 + + srai.d I, N, 0x04 + beq ZERO, I, .L_2N15 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 + +.L_2N15: + andi I, N, 0x08 + beq ZERO, I, .L_2N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d P2, P2, 0x80 + +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P3, P3, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpermi.q U0, U1, 0x02 + + xvst U0, P4, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P4, P4, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P5, P5, 0x10 + +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x04 + beq ZERO, I, .L_1N15 + +.L_1I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 
+ xvst U3, P1, 0x60 + + addi.d S1, S1, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N15: + andi I, N, 0x08 + beq ZERO, I, .L_1N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + + addi.d S1, S1, 0x40 + addi.d P2, P2, 0x40 + +.L_1N7: + andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + xvld U0, S1, 0x00 + + xvst U0, P3, 0x00 + + addi.d S1, S1, 0x20 + addi.d P3, P3, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x10 + addi.d P4, P4, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P5, 0x00 + + addi.d S1, S1, 0x08 + addi.d P5, P5, 0x08 + +.L_M0: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + addi.d $sp, $sp, 56 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_4.S b/kernel/loongarch64/dgemm_tcopy_4.S new file mode 100644 index 000000000..700989ca1 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_4.S @@ -0,0 +1,270 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
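dgemm_tcopy_16.S above packs the other operand: each source line (stride lda) is cut into runs of 16 contiguous doubles, and every panel width writes into its own region of the destination (the P2..P5 pointers), ordered by line inside a panel. A rough scalar model of the resulting layout, with illustrative names:

    #include <stddef.h>

    /* Scalar model of dgemm_tcopy_16.S above.  Width-w panels starting at
     * column offset j land at b + j*m; inside a panel the w values of
     * line i are contiguous at offset i*w. */
    static void dgemm_tcopy_16_ref(ptrdiff_t m, ptrdiff_t n,
                                   const double *a, ptrdiff_t lda, double *b)
    {
        ptrdiff_t j = 0;
        for (ptrdiff_t w = 16; w > 0; w >>= 1)       /* 16, 8, 4, 2, 1 */
            for (; j + w <= n; j += w)
                for (ptrdiff_t i = 0; i < m; i++)
                    for (ptrdiff_t jj = 0; jj < w; jj++)
                        b[j * m + i * w + jj] = a[i * lda + j + jj];
    }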
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define P0 $r16 +#define P1 $r17 +#define P2 $r18 +#define P3 $r19 +#define T0 $r20 +#define T1 $r23 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x02 + slli.d T0, T0, 0x02 + srai.d T1, N, 0x01 + slli.d T1, T1, 0x01 + mul.d T0, M, T0 + mul.d T1, M, T1 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x03 + add.d P2, DST, T0 + add.d P3, DST, T1 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x02 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x05 + beq ZERO, J, .L_M3 + +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x02 + addi.d J, J, -1 + beq ZERO, I, .L_N3 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + add.d P1, P1, T1 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + + xvst U0, P2, 0x00 + xvst U2, P2, 0x20 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P2, P2, 0x40 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + fst.d F2, P3, 0x10 + fst.d F3, P3, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P3, P3, 0x20 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_2N3 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_2I1 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpermi.q U0, U1, 0x02 + + xvst U0, P2, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P2, P2, 0x20 + +.L_2N1: + addi.d I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P3, P3, 0x10 + +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + move P1, P0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_1N3 + +.L_1I1: + xvld U0, S1, 0x00 + + xvst U0, P1, 0x00 + + addi.d S1, S1, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_1I1 + +.L_1N3: + andi I, N, 0x02 + beq I, ZERO, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P2, 
0x00 + fst.d F1, P2, 0x08 + + addi.d S1, S1, 0x10 + addi.d P2, P2, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq I, ZERO, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P3, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dnrm2.S b/kernel/loongarch64/dnrm2.S new file mode 100644 index 000000000..41db48bdf --- /dev/null +++ b/kernel/loongarch64/dnrm2.S @@ -0,0 +1,314 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
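dgemm_tcopy_4.S above mirrors the 16-wide version at panel widths 4/2/1, with P2 and P3 holding the 2- and 1-wide tails; the scalar model is the same as before with the width loop starting at 4 (a sketch of the layout only).

    /* Same layout as dgemm_tcopy_16_ref, restricted to widths 4, 2, 1. */
    static void dgemm_tcopy_4_ref(ptrdiff_t m, ptrdiff_t n,
                                  const double *a, ptrdiff_t lda, double *b)
    {
        ptrdiff_t j = 0;
        for (ptrdiff_t w = 4; w > 0; w >>= 1)
            for (; j + w <= n; j += w)
                for (ptrdiff_t i = 0; i < m; i++)
                    for (ptrdiff_t jj = 0; jj < w; jj++)
                        b[j * m + i * w + jj] = a[i * lda + j + jj];
    }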
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r7 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define ALPHA $f4 +#define max $f5 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + move XX, X + NOP + LD a1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + FABS s1, a1 + FABS s2, a1 + bge $r0, N, .L999 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L100 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L100: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + addi.d N, N, 1 + lu12i.w TEMP, 0x3f800 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, s1, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, s1 + MOV max, s1 + MOV s1, a1 + MOV s2, a1 + MOV s3, a1 + MOV s4, a1 + srai.d I, N, 3 + bge $r0, I, .L105 + LD a1, XX, 0 * SIZE + add.d XX, XX, INCX + LD a2, XX, 0 * SIZE + add.d XX, XX, INCX + LD a3, XX, 0 * SIZE + add.d XX, XX, INCX + LD a4, XX, 0 * SIZE + add.d XX, XX, INCX + LD a5, XX, 0 * SIZE + add.d XX, XX, INCX + LD a6, XX, 0 * SIZE + add.d XX, XX, INCX 
+ LD a7, XX, 0 * SIZE + add.d XX, XX, INCX + LD a8, XX, 0 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + bge $r0, I, .L104 + .align 3 + +.L103: + MUL t1, ALPHA, a1 + LD a1, XX, 0 * SIZE + MUL t2, ALPHA, a2 + add.d XX, XX, INCX + MUL t3, ALPHA, a3 + LD a2, XX, 0 * SIZE + MUL t4, ALPHA, a4 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a3, XX, 0 * SIZE + MADD s2, t2, t2, s2 + add.d XX, XX, INCX + MADD s3, t3, t3, s3 + LD a4, XX, 0 * SIZE + MADD s4, t4, t4, s4 + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, XX, 0 * SIZE + MUL t2, ALPHA, a6 + add.d XX, XX, INCX + MUL t3, ALPHA, a7 + LD a6, XX, 0 * SIZE + MUL t4, ALPHA, a8 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a7, XX, 0 * SIZE + MADD s2, t2, t2, s2 + add.d XX, XX, INCX + MADD s3, t3, t3, s3 + LD a8, XX, 0 * SIZE + MADD s4, t4, t4, s4 + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L103 + .align 3 + +.L104: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + .align 3 + +.L105: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L106: + LD a1, XX, 0 * SIZE + addi.d I, I, -1 + MUL t1, ALPHA, a1 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + blt $r0, I, .L106 + .align 3 + +.L998: + ADD s1, s1, s2 + ADD s3, s3, s4 + ADD s1, s1, s3 + fsqrt.d s1, s1 + move $r4, $r17 + MUL $f0, max, s1 + jirl $r0, $r1, 0x0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dot.S b/kernel/loongarch64/dot.S new file mode 100644 index 000000000..1e4c81a02 --- /dev/null +++ b/kernel/loongarch64/dot.S @@ -0,0 +1,391 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
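dnrm2.S above uses the classic two-pass scaled norm to avoid overflow and underflow: a first pass over x finds the largest magnitude, a second pass accumulates the squares of x[i]/max, and the result is max*sqrt(sum). The assembly unrolls by 8 and keeps four partial maxima/sums (s1..s4); the plain C restatement below keeps a single accumulator and is only a sketch of the algorithm, not of the kernel's calling convention.

    #include <math.h>
    #include <stddef.h>

    /* Two-pass scaled Euclidean norm, as in dnrm2.S above (positive incx). */
    static double dnrm2_ref(ptrdiff_t n, const double *x, ptrdiff_t incx)
    {
        if (n <= 0 || incx <= 0)
            return 0.0;

        double amax = 0.0;                       /* pass 1: largest |x[i]|   */
        for (ptrdiff_t i = 0; i < n; i++) {
            double t = fabs(x[i * incx]);
            if (amax < t)
                amax = t;
        }
        if (amax == 0.0)
            return 0.0;

        double scale = 1.0 / amax, ssq = 0.0;    /* pass 2: sum of squares   */
        for (ptrdiff_t i = 0; i < n; i++) {      /* of the scaled elements   */
            double t = scale * x[i * incx];
            ssq += t * t;
        }
        return amax * sqrt(ssq);
    }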
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f23 +#define a2 $f9 +#define a3 $f10 +#define a4 $f11 +#define b1 $f12 +#define b2 $f13 +#define b3 $f14 +#define b4 $f15 +#define s1 $f22 +#define s2 $f8 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + MTC s1, $r0 + MTC s2, $r0 + slli.d INCX, INCX, BASE_SHIFT + li.d TEMP, SIZE + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + LD a2, X, 1 * SIZE + LD b2, Y, 1 * SIZE + LD a3, X, 2 * SIZE + LD b3, Y, 2 * SIZE + LD a4, X, 3 * SIZE + addi.d I, I, -1 + LD b4, Y, 3 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 4 * SIZE + LD b1, Y, 4 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 5 * SIZE + LD b2, Y, 5 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif + LD a3, X, 6 * SIZE + LD b3, Y, 6 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 7 * SIZE + LD b4, Y, 7 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 8 * SIZE + LD b1, Y, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 9 * SIZE + LD b2, Y, 9 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif + LD a3, X, 10 * SIZE + LD b3, Y, 10 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 11 * SIZE + LD b4, Y, 11 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE +addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 +.L13: +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 4 * SIZE + LD b1, Y, 4 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 5 * SIZE + LD b2, Y, 5 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif + LD a3, X, 6 * SIZE + LD b3, Y, 6 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 7 * SIZE + LD b4, Y, 7 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + addi.d X, X, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + addi.d Y, Y, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + .align 3 +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + 
fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + addi.d I, I, -1 + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: +#ifdef F_INTERFACE + bgez INCX, .L21 + addi.d TEMP, N, -1 + mult TEMP, INCX + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + addi.d TEMP, N, -1 + mult TEMP, INCY + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + bge $r0, I, .L25 + .align 3 + +.L23: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + blt $r0, I, .L23 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + blt $r0, I, .L26 + .align 3 + +.L999: +#ifdef DSDOT + fadd.d $f0, s1, s2 +#else + ADD $f0, s1, s2 +#endif + move $r4, $r17 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/gemm_kernel.S b/kernel/loongarch64/gemm_kernel.S new file mode 100644 index 000000000..8926bf123 --- /dev/null +++ b/kernel/loongarch64/gemm_kernel.S @@ -0,0 +1,1859 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r30 +#define PREFETCHSIZE (4 * 10) +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define BB $r29 + +#if defined(TRMMKERNEL) +#define OFFSET $r11 +#define KK $r20 +#define TEMP $r16 +#endif + +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -160 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + SDARG $r30, $sp, 96 + fst.d $f24, $sp, 56 + fst.d $f25, $sp, 64 + fst.d $f26, $sp, 72 + fst.d $f27, $sp, 80 + fst.d $f28, $sp, 88 +#if defined(TRMMKERNEL) + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#endif +#ifndef __64BIT__ + fst.d $f18, $sp, 120 + fst.d $f19, $sp, 128 + fst.d $f20, $sp, 136 + fst.d $f21, $sp, 144 +#endif + slli.d LDC, LDC, BASE_SHIFT +#if defined(TRMMKERNEL) && !defined(LEFT) + sub.d KK, $r0, OFFSET +#endif + srai.d J, N, 3 +nop + bge $r0, J, .L30 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 + add.d C, CO8, LDC + slli.d BB, K, 2 + BASE_SHIFT + add.d BB, B, BB +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif +MOV c61, c11 + bge $r0, I, .L20 +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * 
SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 8 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#else + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + preld 1, CO1, 3 * SIZE + preld 1, CO2, 3 * SIZE + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + preld 1, CO3, 2 * SIZE + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD a4, AO, 2 * SIZE + MADD c61, b2, a1, c61 + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD a4, AO, 6 * SIZE + MADD c61, b2, a3, c61 + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + 
LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + preld 1, CO4, 3 * SIZE + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + preld 1, CO5, 3 * SIZE + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + preld 1, CO6, 3 * SIZE + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + preld 1, CO7, 3 * SIZE + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + preld 1, CO8, 3 * SIZE + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d CO3,CO3, 2 * SIZE + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + LD $f23, CO2, 0 * SIZE + addi.d CO4,CO4, 2 * SIZE + LD $f9, CO2, 1 * SIZE + addi.d CO2,CO2, 2 * SIZE + LD $f10, CO3, -2 * SIZE + addi.d CO5,CO5, 2 * SIZE + LD $f11, CO3, -1 * SIZE + addi.d CO6,CO6, 
2 * SIZE + LD $f12, CO4, -2 * SIZE + addi.d CO7,CO7, 2 * SIZE + LD $f13, CO4, -1 * SIZE + addi.d I, I, -1 + MADD c11, c11, ALPHA, $f22 + LD $f22, CO5, -2 * SIZE + MADD c12, c12, ALPHA, $f8 + LD $f8, CO5, -1 * SIZE + MADD c21, c21, ALPHA, $f23 + LD $f23, CO6, -2 * SIZE + MADD c22, c22, ALPHA, $f9 + LD $f9, CO6, -1 * SIZE + MADD c31, c31, ALPHA, $f10 + LD $f10, CO7, -2 * SIZE + MADD c32, c32, ALPHA, $f11 + LD $f11, CO7, -1 * SIZE + MADD c41, c41, ALPHA, $f12 + LD $f12, CO8, 0 * SIZE + MADD c42, c42, ALPHA, $f13 + LD $f13, CO8, 1 * SIZE + preld 0, BB, 0 * SIZE + preld 0, BB, 8 * SIZE + ST c11, CO1, -2 * SIZE + MTC c11, $r0 + ST c12, CO1, -1 * SIZE + addi.d CO8,CO8, 2 * SIZE + ST c21, CO2, -2 * SIZE + MOV c21, c11 + ST c22, CO2, -1 * SIZE + addi.d BB, BB, 16 * SIZE + MADD c51, c51, ALPHA, $f22 + ST c31, CO3, -2 * SIZE + MADD c52, c52, ALPHA, $f8 + ST c32, CO3, -1 * SIZE + MADD c61, c61, ALPHA, $f23 + ST c41, CO4, -2 * SIZE + MADD c62, c62, ALPHA, $f9 + ST c42, CO4, -1 * SIZE + MADD c71, c71, ALPHA, $f10 + ST c51, CO5, -2 * SIZE + MADD c72, c72, ALPHA, $f11 + ST c52, CO5, -1 * SIZE + MADD c81, c81, ALPHA, $f12 + ST c61, CO6, -2 * SIZE + MADD c82, c82, ALPHA, $f13 + ST c62, CO6, -1 * SIZE + ST c71, CO7, -2 * SIZE + MOV c31, c11 + ST c72, CO7, -1 * SIZE + MOV c41, c11 + ST c81, CO8, -2 * SIZE + MOV c51, c11 + ST c82, CO8, -1 * SIZE +MOV c61, c11 + blt $r0, I, .L11 +#else + addi.d CO4,CO4, 2 * SIZE + addi.d CO5,CO5, 2 * SIZE + addi.d CO6,CO6, 2 * SIZE + addi.d CO7,CO7, 2 * SIZE + preld 0, BB, 0 * SIZE + preld 0, BB, 8 * SIZE + MUL c11, ALPHA, c11 + addi.d CO1,CO1, 2 * SIZE + MUL c12, ALPHA, c12 + MTC a1, $r0 + MUL c21, ALPHA, c21 + addi.d CO2,CO2, 2 * SIZE + MUL c22, ALPHA, c22 + addi.d CO3,CO3, 2 * SIZE + ST c11, CO1, -2 * SIZE + MUL c31, ALPHA, c31 + ST c12, CO1, -1 * SIZE + MUL c32, ALPHA, c32 + ST c21, CO2, -2 * SIZE + MUL c41, ALPHA, c41 + ST c22, CO2, -1 * SIZE + MUL c42, ALPHA, c42 + ST c31, CO3, -2 * SIZE + MUL c51, ALPHA, c51 + ST c32, CO3, -1 * SIZE + MUL c52, ALPHA, c52 + ST c41, CO4, -2 * SIZE + MUL c61, ALPHA, c61 + ST c42, CO4, -1 * SIZE + MUL c62, ALPHA, c62 + ST c51, CO5, -2 * SIZE + MUL c71, ALPHA, c71 + ST c52, CO5, -1 * SIZE + MUL c72, ALPHA, c72 + ST c61, CO6, -2 * SIZE + MUL c81, ALPHA, c81 + ST c62, CO6, -1 * SIZE + MUL c82, ALPHA, c82 + ST c71, CO7, -2 * SIZE + MOV c11, a1 + ST c72, CO7, -1 * SIZE + MOV c21, a1 + addi.d CO8,CO8, 2 * SIZE + addi.d BB, BB, 16 * SIZE + ST c81, CO8, -2 * SIZE + MOV c31, a1 + ST c82, CO8, -1 * SIZE + MOV c41, a1 + addi.d I, I, -1 + MOV c51, a1 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif +MOV c61, a1 + blt $r0, I, .L11 +#endif + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + 
sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 8 +#endif + srai.d L, TEMP, 2 +MOV c81, c11 + bge $r0, L, .L25 +#else + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + LD $f8, CO2, 0 * SIZE + LD $f23, CO3, 0 * SIZE + LD $f9, CO4, 0 * SIZE + MADD c11, c11, ALPHA, $f22 + LD $f10, CO5, 0 * SIZE + MADD c21, c21, ALPHA, $f8 + LD $f11, CO6, 0 * SIZE + MADD c31, c31, ALPHA, $f23 + LD $f12, CO7, 0 * SIZE + MADD c41, c41, ALPHA, $f9 + LD $f13, CO8, 0 * SIZE + MADD c51, c51, ALPHA, $f10 + ST c11, CO1, 0 * SIZE + MADD c61, c61, ALPHA, $f11 + ST c21, CO2, 0 * SIZE + MADD c71, c71, ALPHA, $f12 + ST c31, CO3, 0 * SIZE + MADD c81, c81, ALPHA, $f13 + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#else + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + MUL c31, ALPHA, c31 + MUL c41, ALPHA, c41 + ST c11, CO1, 0 * SIZE + MUL c51, 
ALPHA, c51 + ST c21, CO2, 0 * SIZE + MUL c61, ALPHA, c61 + ST c31, CO3, 0 * SIZE + MUL c71, ALPHA, c71 + ST c41, CO4, 0 * SIZE + MUL c81, ALPHA, c81 + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + .align 3 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 8 +#endif +move B, BO + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + add.d C, CO4, LDC + MOV c31, c11 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + srai.d I, M, 1 +MOV c41, c11 + bge $r0, I, .L40 +.L31: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 4 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L35 +#else + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi 
L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d CO3,CO3, 2 * SIZE + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + LD $f23, CO2, 0 * SIZE + addi.d CO4,CO4, 2 * SIZE + LD $f9, CO2, 1 * SIZE + addi.d CO2,CO2, 2 * SIZE + LD $f10, CO3, -2 * SIZE + MADD c11, c11, ALPHA, $f22 + LD $f11, CO3, -1 * SIZE + MADD c12, c12, ALPHA, $f8 + LD $f12, CO4, -2 * SIZE + MADD c21, c21, ALPHA, $f23 + LD $f13, CO4, -1 * SIZE + MADD c22, c22, ALPHA, $f9 + MADD c31, c31, ALPHA, $f10 + ST c11, CO1, -2 * SIZE + MADD c32, c32, ALPHA, $f11 + ST c12, CO1, -1 * SIZE + MADD c41, c41, ALPHA, $f12 + ST c21, CO2, -2 * SIZE + MADD c42, c42, ALPHA, $f13 + ST c22, CO2, -1 * SIZE + ST c31, CO3, -2 * SIZE + MTC c11, $r0 + ST c32, CO3, -1 * SIZE + addi.d I, I, -1 + ST c41, CO4, -2 * SIZE + MOV c21, c11 + ST c42, CO4, -1 * SIZE + MOV c31, c11 +#else + MUL c11, ALPHA, c11 + addi.d CO3,CO3, 2 * SIZE + MUL c12, ALPHA, c12 + addi.d CO1,CO1, 2 * SIZE + MUL c21, ALPHA, c21 + addi.d CO4,CO4, 2 * SIZE + MUL c22, ALPHA, c22 + addi.d CO2,CO2, 2 * SIZE + ST c11, CO1, -2 * SIZE + MUL c31, ALPHA, c31 + ST c12, CO1, -1 * SIZE + MUL c32, ALPHA, c32 + ST c21, CO2, -2 * SIZE + MUL c41, ALPHA, c41 + ST c22, CO2, -1 * SIZE + MUL c42, ALPHA, c42 + ST c31, CO3, -2 * SIZE + MTC c11, $r0 + ST c32, CO3, -1 * SIZE + addi.d I, I, -1 + ST c41, CO4, -2 * SIZE + MOV c21, c11 + ST c42, CO4, -1 * SIZE + MOV c31, c11 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif +#endif +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 4 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#else + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE 
+ MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + LD $f8, CO2, 0 * SIZE + LD $f23, CO3, 0 * SIZE + LD $f9, CO4, 0 * SIZE + MADD c11, c11, ALPHA, $f22 + MADD c21, c21, ALPHA, $f8 + MADD c31, c31, ALPHA, $f23 + MADD c41, c41, ALPHA, $f9 + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#else + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + MUL c31, ALPHA, c31 + MUL c41, ALPHA, c41 + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + .align 3 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 4 +#endif + move B, BO + .align 3 + +.L50: + andi J, N, 2 +move AO, A + bge $r0, J, .L70 + move CO1, C + add.d CO2, C, LDC +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + srai.d I, M, 1 +add.d C, CO2, LDC + bge $r0, I, .L60 +.L51: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 2 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L55 +#else + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 
8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d I, I, -1 + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + addi.d CO2,CO2, 2 * SIZE + MADD c11, c11, ALPHA, $f22 + MADD c12, c12, ALPHA, $f8 + MADD c21, c21, ALPHA, $f23 + MADD c22, c22, ALPHA, $f9 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE + ST c21, CO2, -2 * SIZE + ST c22, CO2, -1 * SIZE + blt $r0, I, .L51 +#else + addi.d I, I, -1 + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE + MUL c11, ALPHA, c11 + MUL c12, ALPHA, c12 + MUL c21, ALPHA, c21 + MUL c22, ALPHA, c22 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE + ST c21, CO2, -2 * SIZE + ST c22, CO2, -1 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif + blt $r0, I, .L51 +#endif + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 2 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L65 +#else + srai.d L, K, 2 + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD 
b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + LD $f8, CO2, 0 * SIZE + ADD c11, c11, c31 + ADD c21, c21, c41 + MADD c11, c11, ALPHA, $f22 + MADD c21, c21, ALPHA, $f8 + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#else + ADD c11, c11, c31 + ADD c21, c21, c41 + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + .align 3 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 2 +#endif + move B, BO + .align 3 + +.L70: + andi J, N, 1 +move AO, A + bge $r0, J, .L999 + move CO1, C +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + srai.d I, M, 1 +add.d C, CO1, LDC + bge $r0, I, .L80 +.L71: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 1 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L75 +#else + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * 
SIZE + addi.d I, I, -1 + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + ADD c11, c11, c21 + ADD c12, c12, c22 + MADD c11, c11, ALPHA, $f22 + MADD c12, c12, ALPHA, $f8 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE + blt $r0, I, .L71 +#else + ADD c11, c11, c21 + addi.d I, I, -1 + ADD c12, c12, c22 + addi.d CO1,CO1, 2 * SIZE + MUL c11, ALPHA, c11 + MUL c12, ALPHA, c12 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif + blt $r0, I, .L71 +#endif + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 1 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#else + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + ADD c11, c11, c21 + MADD c11, c11, ALPHA, $f22 + ST c11, CO1, 0 * SIZE +#else + ADD c11, c11, c21 + MUL c11, ALPHA, c11 + ST c11, CO1, 0 * SIZE +#endif + .align 3 + +.L89: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 1 +#endif + move B, BO + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + LDARG $r30, $sp, 96 + fld.d $f24, $sp, 56 + fld.d $f25, $sp, 64 + fld.d $f26, $sp, 72 + fld.d $f27, $sp, 80 + fld.d $f28, $sp, 88 +#if defined(TRMMKERNEL) + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#endif +#ifndef __64BIT__ + fld.d $f18, $sp, 120 + fld.d $f19, $sp, 128 + fld.d $f20, $sp, 136 + fld.d $f21, $sp, 144 +#endif + addi.d $sp, $sp, 160 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/gemv_n.S b/kernel/loongarch64/gemv_n.S new file mode 100644 
index 000000000..9ab43ae19 --- /dev/null +++ b/kernel/loongarch64/gemv_n.S @@ -0,0 +1,531 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Unused param dummy1 */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define YORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define ALPHA $f0 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define x1 $f14 +#define x2 $f15 +#define y1 $f16 +#define y2 $f17 +#define y3 $f3 +#define y4 $f1 +#define y5 $f2 +#define y6 $f4 +#define y7 $f5 +#define y8 $f6 +#define t1 $f7 +#define t2 $f18 +#define t3 $f19 +#define t4 $f20 + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifdef __64BIT__ + addi.d $sp, $sp, -16 +#else + addi.d $sp, $sp, -48 +#endif + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + slli.d LDA, LDA, BASE_SHIFT +#ifndef __64BIT__ + fst.d $f18, $sp, 16 + fst.d $f19, $sp, 24 + fst.d $f20, $sp, 32 +#endif + slli.d INCX, INCX, BASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + li.d I, SIZE + move YORIG, Y + beq INCY, I, .L10 + srai.d I, M, 2 + move YORIG, BUFFER + move XX, Y + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, XX, 0 * SIZE + add.d XX, XX, INCY + LD a2, XX, 0 * SIZE + add.d XX, XX, INCY + LD a3, XX, 0 * SIZE + add.d XX, XX, INCY + LD a4, XX, 0 * SIZE + add.d XX, XX, INCY + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + ST a3, YY, 2 * SIZE + ST a4, YY, 3 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 4 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, XX, 0 * SIZE + add.d XX, XX, INCY + ST a1, YY, 0 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 
1 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + bge $r0, J, .L20 + .align 3 + +.L11: + LD x1, X, 0 * SIZE + add.d X, X, INCX + LD x2, X, 0 * SIZE + add.d X, X, INCX + move AO1, A + add.d AO2, A, LDA + add.d A, AO2, LDA + move YY, YORIG + MUL x1, ALPHA, x1 + srai.d I, M, 3 + MUL x2, ALPHA, x2 + bge $r0, I, .L15 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + LD a5, AO2, 0 * SIZE + LD y5, YY, 4 * SIZE + LD a6, AO2, 1 * SIZE + LD y6, YY, 5 * SIZE + LD a7, AO2, 2 * SIZE + LD y7, YY, 6 * SIZE + LD a8, AO2, 3 * SIZE + addi.d I, I, -1 + LD y8, YY, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + LD y1, YY, 8 * SIZE + LD y2, YY, 9 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + LD y3, YY, 10 * SIZE + LD y4, YY, 11 * SIZE + MADD t1, a5, x2, t1 + LD a5, AO2, 4 * SIZE + MADD t2, a6, x2, t2 + LD a6, AO2, 5 * SIZE + MADD t3, a7, x2, t3 + LD a7, AO2, 6 * SIZE + MADD t4, a8, x2, t4 + LD a8, AO2, 7 * SIZE + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + ST t3, YY, 2 * SIZE + ST t4, YY, 3 * SIZE + MADD t1, a1, x1, y5 + LD a1, AO1, 8 * SIZE + MADD t2, a2, x1, y6 + LD a2, AO1, 9 * SIZE + LD y5, YY, 12 * SIZE + LD y6, YY, 13 * SIZE + MADD t3, a3, x1, y7 + LD a3, AO1, 10 * SIZE + MADD t4, a4, x1, y8 + LD a4, AO1, 11 * SIZE + LD y7, YY, 14 * SIZE + LD y8, YY, 15 * SIZE + MADD t1, a5, x2, t1 + LD a5, AO2, 8 * SIZE + MADD t2, a6, x2, t2 + LD a6, AO2, 9 * SIZE + MADD t3, a7, x2, t3 + LD a7, AO2, 10 * SIZE + MADD t4, a8, x2, t4 + LD a8, AO2, 11 * SIZE + ST t1, YY, 4 * SIZE + ST t2, YY, 5 * SIZE + ST t3, YY, 6 * SIZE + ST t4, YY, 7 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d AO2, AO2, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + MADD t1, a5, x2, t1 + LD a5, AO2, 4 * SIZE + MADD t2, a6, x2, t2 + LD a6, AO2, 5 * SIZE + MADD t3, a7, x2, t3 + LD a7, AO2, 6 * SIZE + MADD t4, a8, x2, t4 + LD a8, AO2, 7 * SIZE + ST t1, YY, 0 * SIZE + MADD t1, a1, x1, y5 + ST t2, YY, 1 * SIZE + MADD t2, a2, x1, y6 + ST t3, YY, 2 * SIZE + MADD t3, a3, x1, y7 + ST t4, YY, 3 * SIZE + MADD t4, a4, x1, y8 + MADD t1, a5, x2, t1 + addi.d AO1, AO1, 8 * SIZE + MADD t2, a6, x2, t2 + addi.d AO2, AO2, 8 * SIZE + MADD t3, a7, x2, t3 + addi.d YY, YY, 8 * SIZE + MADD t4, a8, x2, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L15: + andi I, M, 4 + bge $r0, I, .L16 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + LD a5, AO2, 0 * SIZE + MADD y1, a1, x1, y1 + LD a6, AO2, 1 * SIZE + MADD y2, a2, x1, y2 + LD a7, AO2, 2 * SIZE + MADD y3, a3, x1, y3 + LD a8, AO2, 3 * SIZE + MADD y4, a4, x1, y4 + MADD y1, a5, x2, y1 + addi.d YY, YY, 4 * SIZE + MADD y2, a6, x2, y2 + addi.d AO1, AO1, 4 * SIZE + MADD y3, a7, x2, y3 + addi.d AO2, AO2, 4 * SIZE + MADD y4, a8, x2, y4 + ST y1, YY, -4 * SIZE + ST y2, YY, -3 * SIZE + ST y3, YY, -2 * SIZE + ST y4, YY, -1 * SIZE + .align 3 + +.L16: + andi I, M, 2 + bge $r0, I, .L17 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 
1 * SIZE + LD y2, YY, 1 * SIZE + LD a5, AO2, 0 * SIZE + LD a6, AO2, 1 * SIZE + MADD y1, a1, x1, y1 + MADD y2, a2, x1, y2 + addi.d YY, YY, 2 * SIZE + MADD y1, a5, x2, y1 + addi.d AO1, AO1, 2 * SIZE + MADD y2, a6, x2, y2 + addi.d AO2, AO2, 2 * SIZE + ST y1, YY, -2 * SIZE + ST y2, YY, -1 * SIZE + .align 3 + +.L17: + andi I, M, 1 + bge $r0, I, .L19 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD a5, AO2, 0 * SIZE + MADD y1, a1, x1, y1 + MADD y1, a5, x2, y1 + ST y1, YY, 0 * SIZE + .align 3 + +.L19: + addi.d J, J, -1 + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + bge $r0, J, .L900 + .align 3 + +.L21: + LD x1, X, 0 * SIZE + add.d X, X, INCX + move YY, YORIG + move AO1, A + srai.d I, M, 3 + MUL x1, ALPHA, x1 + bge $r0, I, .L25 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + LD y5, YY, 4 * SIZE + LD y6, YY, 5 * SIZE + LD y7, YY, 6 * SIZE + addi.d I, I, -1 + LD y8, YY, 7 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + LD y1, YY, 8 * SIZE + LD y2, YY, 9 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + LD y3, YY, 10 * SIZE + LD y4, YY, 11 * SIZE + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + ST t3, YY, 2 * SIZE + ST t4, YY, 3 * SIZE + MADD t1, a1, x1, y5 + LD a1, AO1, 8 * SIZE + MADD t2, a2, x1, y6 + LD a2, AO1, 9 * SIZE + LD y5, YY, 12 * SIZE + LD y6, YY, 13 * SIZE + MADD t3, a3, x1, y7 + LD a3, AO1, 10 * SIZE + MADD t4, a4, x1, y8 + LD a4, AO1, 11 * SIZE + LD y7, YY, 14 * SIZE + LD y8, YY, 15 * SIZE + ST t1, YY, 4 * SIZE + ST t2, YY, 5 * SIZE + ST t3, YY, 6 * SIZE + ST t4, YY, 7 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + ST t1, YY, 0 * SIZE + MADD t1, a1, x1, y5 + ST t2, YY, 1 * SIZE + MADD t2, a2, x1, y6 + ST t3, YY, 2 * SIZE + MADD t3, a3, x1, y7 + ST t4, YY, 3 * SIZE + MADD t4, a4, x1, y8 + ST t1, YY, 4 * SIZE + ST t2, YY, 5 * SIZE + ST t3, YY, 6 * SIZE + ST t4, YY, 7 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d YY, YY, 8 * SIZE + .align 3 + +.L25: + andi I, M, 4 + bge $r0, I, .L26 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + MADD y1, a1, x1, y1 + MADD y2, a2, x1, y2 + MADD y3, a3, x1, y3 + addi.d YY, YY, 4 * SIZE + MADD y4, a4, x1, y4 + addi.d AO1, AO1, 4 * SIZE + ST y1, YY, -4 * SIZE + ST y2, YY, -3 * SIZE + ST y3, YY, -2 * SIZE + ST y4, YY, -1 * SIZE + .align 3 + +.L26: + andi I, M, 2 + bge $r0, I, .L27 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + MADD y1, a1, x1, y1 + addi.d YY, YY, 2 * SIZE + MADD y2, a2, x1, y2 + addi.d AO1, AO1, 2 * SIZE + ST y1, YY, -2 * SIZE + ST y2, YY, -1 * SIZE + .align 3 + +.L27: + andi I, M, 1 + bge $r0, I, .L900 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + MADD y1, a1, x1, y1 + ST y1, YY, 0 * SIZE + .align 3 + +.L900: + li.d YORIG, SIZE + srai.d I, M, 2 + beq INCY, YORIG, .L999 + move XX, BUFFER + bge $r0, I, .L905 + .align 3 + +.L902: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + LD a3, XX, 2 * SIZE + LD a4, XX, 3 * SIZE + ST a1, Y, 0 * SIZE + add.d Y, 
Y, INCY + ST a2, Y, 0 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + add.d Y, Y, INCY + ST a4, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + addi.d XX, XX, 4 * SIZE + blt $r0, I, .L902 + .align 3 + +.L905: + andi I, M, 3 + bge $r0, I, .L999 + .align 3 + +.L906: + LD a1, XX, 0 * SIZE + addi.d XX, XX, 1 * SIZE + ST a1, Y, 0 * SIZE + addi.d I, I, -1 + add.d Y, Y, INCY + blt $r0, I, .L906 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 +#ifndef __64BIT__ + fld.d $f18, $sp, 16 + fld.d $f19, $sp, 24 + fld.d $f20, $sp, 32 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 16 +#else + addi.d $sp, $sp, 48 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/gemv_t.S b/kernel/loongarch64/gemv_t.S new file mode 100644 index 000000000..af4232769 --- /dev/null +++ b/kernel/loongarch64/gemv_t.S @@ -0,0 +1,436 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Unused param dummy1 */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define XORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define ALPHA $f0 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define y1 $f14 +#define y2 $f15 +#define y3 $f16 +#define y4 $f17 +#define x1 $f3 +#define x2 $f1 +#define x3 $f2 +#define x4 $f4 +#define x5 $f5 +#define x6 $f6 +#define x7 $f7 +#define x8 $f18 + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifdef __64BIT__ + addi.d $sp, $sp, -16 +#else + addi.d $sp, $sp, -32 +#endif + MTC y1, $r0 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + slli.d LDA, LDA, BASE_SHIFT +#ifndef __64BIT__ + fst.d $f18, $sp, 16 +#endif + slli.d INCX, INCX, BASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + li.d I, SIZE + move XORIG, X + beq INCX, I, .L10 + srai.d I, M, 2 + move XORIG, BUFFER + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + ST a3, YY, 2 * SIZE + ST a4, YY, 3 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 4 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, YY, 0 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 1 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + move YY, Y + bge $r0, J, .L20 + .align 3 + +.L11: + move AO1, A + MOV y2, y1 + add.d AO2, A, LDA + MOV y3, y1 + add.d A, AO2, LDA + MOV y4, y1 + srai.d I, M, 3 + move XX, XORIG + bge $r0, I, .L15 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO2, 0 * SIZE + LD x2, XX, 1 * SIZE + LD a3, AO1, 1 * SIZE + LD x3, XX, 2 * SIZE + LD a4, AO2, 1 * SIZE + LD x4, XX, 3 * SIZE + LD a5, AO1, 2 * SIZE + LD x5, XX, 4 * SIZE + LD a6, AO2, 2 * SIZE + LD x6, XX, 5 * SIZE + LD a7, AO1, 3 * SIZE + LD x7, XX, 6 * SIZE + LD a8, AO2, 3 * SIZE + addi.d I, I, -1 + LD x8, XX, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y2, a2, x1, y2 + LD a2, AO2, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + MADD y4, a4, x2, y4 + LD a4, AO2, 5 * SIZE + LD x1, XX, 8 * SIZE + LD x2, XX, 9 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y2, a6, x3, y2 + LD a6, AO2, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + MADD y4, a8, x4, y4 + LD a8, AO2, 7 * SIZE + LD x3, XX, 10 * SIZE + LD x4, XX, 11 * SIZE + MADD y1, a1, x5, y1 + LD a1, AO1, 8 * SIZE + MADD y2, a2, x5, y2 + LD a2, AO2, 8 * SIZE + MADD y3, a3, x6, y3 + LD a3, AO1, 9 * SIZE + MADD y4, a4, x6, y4 + LD a4, AO2, 9 * SIZE + LD x5, XX, 12 * SIZE + LD x6, XX, 13 * SIZE + MADD y1, a5, x7, y1 + LD a5, AO1, 10 * SIZE + MADD y2, a6, x7, y2 + LD a6, AO2, 10 * SIZE + MADD y3, a7, x8, y3 + LD a7, AO1, 11 * SIZE + MADD y4, a8, x8, y4 + LD a8, AO2, 11 * SIZE + LD x7, XX, 14 * SIZE + LD x8, XX, 15 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d AO2, AO2, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + 
MADD y2, a2, x1, y2 + LD a2, AO2, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + MADD y4, a4, x2, y4 + LD a4, AO2, 5 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y2, a6, x3, y2 + LD a6, AO2, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + MADD y4, a8, x4, y4 + LD a8, AO2, 7 * SIZE + MADD y1, a1, x5, y1 + MADD y2, a2, x5, y2 + MADD y3, a3, x6, y3 + MADD y4, a4, x6, y4 + MADD y1, a5, x7, y1 + addi.d XX, XX, 8 * SIZE + MADD y2, a6, x7, y2 + addi.d AO1, AO1, 8 * SIZE + MADD y3, a7, x8, y3 + addi.d AO2, AO2, 8 * SIZE + MADD y4, a8, x8, y4 + .align 3 + +.L15: + andi I, M, 4 + bge $r0, I, .L17 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO2, 0 * SIZE + LD a3, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a4, AO2, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x3, XX, 2 * SIZE + MADD y1, a1, x1, y1 + LD a6, AO2, 2 * SIZE + MADD y2, a2, x1, y2 + LD a7, AO1, 3 * SIZE + MADD y3, a3, x2, y3 + LD x4, XX, 3 * SIZE + MADD y4, a4, x2, y4 + LD a8, AO2, 3 * SIZE + MADD y1, a5, x3, y1 + MADD y2, a6, x3, y2 + addi.d XX, XX, 4 * SIZE + MADD y3, a7, x4, y3 + addi.d AO1, AO1, 4 * SIZE + MADD y4, a8, x4, y4 + addi.d AO2, AO2, 4 * SIZE + .align 3 + +.L17: + andi I, M, 3 + ADD y1, y1, y3 + ADD y2, y2, y4 + bge $r0, I, .L19 + .align 3 +.L18: + LD x1, XX, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD a2, AO2, 0 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 1 * SIZE + addi.d AO1, AO1, 1 * SIZE + addi.d AO2, AO2, 1 * SIZE + MADD y1, a1, x1, y1 + MADD y2, a2, x1, y2 + blt $r0, I, .L18 + .align 3 + +.L19: + LD a1, Y, 0 * SIZE + add.d Y, Y, INCY + LD a2, Y, 0 * SIZE + add.d Y, Y, INCY + MADD a1, y1, ALPHA, a1 + addi.d J, J, -1 + MADD a2, y2, ALPHA, a2 + MTC y1, $r0 + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + ST a2, YY, 0 * SIZE + add.d YY, YY, INCY + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + MOV y3, y1 + move AO1, A + bge $r0, J, .L999 + srai.d I, M, 3 + move XX, XORIG + bge $r0, I, .L25 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a3, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x3, XX, 2 * SIZE + LD a7, AO1, 3 * SIZE + LD x4, XX, 3 * SIZE + LD x5, XX, 4 * SIZE + LD x6, XX, 5 * SIZE + LD x7, XX, 6 * SIZE + addi.d I, I, -1 + LD x8, XX, 7 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + LD x1, XX, 8 * SIZE + LD x2, XX, 9 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + LD x3, XX, 10 * SIZE + LD x4, XX, 11 * SIZE + MADD y1, a1, x5, y1 + LD a1, AO1, 8 * SIZE + MADD y3, a3, x6, y3 + LD a3, AO1, 9 * SIZE + LD x5, XX, 12 * SIZE + LD x6, XX, 13 * SIZE + MADD y1, a5, x7, y1 + LD a5, AO1, 10 * SIZE + MADD y3, a7, x8, y3 + LD a7, AO1, 11 * SIZE + LD x7, XX, 14 * SIZE + LD x8, XX, 15 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + MADD y1, a1, x5, y1 + MADD y3, a3, x6, y3 + MADD y1, a5, x7, y1 + MADD y3, a7, x8, y3 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + .align 3 + +.L25: + andi I, M, 4 + bge $r0, I, .L27 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a3, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x3, XX, 2 * SIZE + MADD y1, a1, x1, y1 + LD a7, AO1, 3 * SIZE + MADD y3, a3, x2, y3 + LD x4, XX, 3 * SIZE + MADD y1, a5, x3, y1 + addi.d XX, XX, 4 * SIZE + MADD y3, a7, 
x4, y3 + addi.d AO1, AO1, 4 * SIZE + .align 3 + +.L27: + andi I, M, 3 + ADD y1, y1, y3 + bge $r0, I, .L29 + .align 3 +.L28: + LD x1, XX, 0 * SIZE + LD a1, AO1, 0 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 1 * SIZE + addi.d AO1, AO1, 1 * SIZE + MADD y1, a1, x1, y1 + blt $r0, I, .L28 + .align 3 + +.L29: + LD a1, Y, 0 * SIZE + add.d Y, Y, INCY + MADD a1, y1, ALPHA, a1 + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 +#ifndef __64BIT__ + fld.d $f18, $sp, 16 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 16 +#else + addi.d $sp, $sp, 32 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/iamax.S b/kernel/loongarch64/iamax.S new file mode 100644 index 000000000..31b1a9e57 --- /dev/null +++ b/kernel/loongarch64/iamax.S @@ -0,0 +1,233 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
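The iamax kernel that follows implements the usual BLAS i?amax contract. A minimal C sketch of that contract, assuming double precision and using a hypothetical helper name, looks like this; the assembly reaches the same result by carrying four running maxima (s1..s4 with indices x1..x4) and folding them at .L998.

```c
#include <math.h>

/* 1-based index of the first element with the largest absolute value,
 * 0 when n <= 0 or incx <= 0 (sketch of the contract, not of the unrolling). */
static long iamax_ref(long n, const double *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0;
    long best = 1;
    double maxabs = fabs(x[0]);
    for (long i = 1; i < n; i++) {
        double v = fabs(x[i * incx]);
        if (v > maxabs) { maxabs = v; best = i + 1; }
    }
    return best;
}
```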
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li.d x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + li.d x1, 1 + bge $r0, N, .L999 + FABS s1, a1 + add.d X, X, INCX + FABS s2, a1 + li.d x2, 1 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d I, I, -1 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + FABS t1, a5 + addi.d TEMP, TEMP, 4 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + add.d X, X, INCX + FABS t1, a1 + addi.d I, I, -1 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + 
CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/iamin.S b/kernel/loongarch64/iamin.S new file mode 100644 index 000000000..9364b9725 --- /dev/null +++ b/kernel/loongarch64/iamin.S @@ -0,0 +1,233 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
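iamin below mirrors iamax with the comparison direction reversed (CMPLT t, s rather than CMPLT s, t). A hedged sketch of its contract, under the same assumptions as above:

```c
#include <math.h>

/* 1-based index of the first element with the smallest absolute value,
 * 0 when n <= 0 or incx <= 0. */
static long iamin_ref(long n, const double *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0;
    long best = 1;
    double minabs = fabs(x[0]);
    for (long i = 1; i < n; i++) {
        double v = fabs(x[i * incx]);
        if (v < minabs) { minabs = v; best = i + 1; }
    }
    return best;
}
```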
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li.d x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + li.d x1, 1 + bge $r0, N, .L999 + FABS s1, a1 + add.d X, X, INCX + FABS s2, a1 + li.d x2, 1 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, t1, s1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d I, I, -1 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + FABS t1, a5 + addi.d TEMP, TEMP, 4 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + add.d X, X, INCX + FABS t1, a1 + addi.d I, I, -1 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + 
CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/izamax.S b/kernel/loongarch64/izamax.S new file mode 100644 index 000000000..8d3ae529e --- /dev/null +++ b/kernel/loongarch64/izamax.S @@ -0,0 +1,217 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
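The complex variant izamax that follows ranks elements by |re| + |im| (the FABS/ADD pairs in the loop body), the standard cabs1 measure, not by the true modulus. A hedged sketch, assuming interleaved (re, im) storage:

```c
#include <math.h>

/* 1-based index of the first complex element maximizing |re| + |im|,
 * 0 when n <= 0 or incx <= 0; incx counts complex elements. */
static long izamax_ref(long n, const double *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0;
    long best = 1;
    double maxabs = fabs(x[0]) + fabs(x[1]);
    for (long i = 1; i < n; i++) {
        double v = fabs(x[2 * i * incx]) + fabs(x[2 * i * incx + 1]);
        if (v > maxabs) { maxabs = v; best = i + 1; }
    }
    return best;
}
```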
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li.d x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + ADD s2, t1, t2 + ADD s3, t1, t2 + ADD s4, t1, t2 + addi.d N, N, -1 + li.d x1, 1 + bge $r0, N, .L999 + add.d X, X, INCX + li.d x2, 1 + srai.d I, N, 2 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t3 + LD a8, X, 1 * SIZE + CMPLT $fcc2, s3, t5 + add.d X, X, INCX + CMPLT $fcc3, s4, t7 + addi.d I, I, -1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t3 + CMPLT $fcc2, s3, t5 + CMPLT $fcc3, s4, t7 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + addi.d I, I, -1 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/izamin.S b/kernel/loongarch64/izamin.S new file mode 100644 index 000000000..38a109c21 --- /dev/null +++ b/kernel/loongarch64/izamin.S @@ -0,0 +1,217 @@ 
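izamin, added next, is the same scan with the minimum kept instead of the maximum; in the assembly only the operand order of the CMPLT compares changes. Sketch under the same assumptions as the izamax sketch above:

```c
#include <math.h>

/* 1-based index of the first complex element minimizing |re| + |im|. */
static long izamin_ref(long n, const double *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0;
    long best = 1;
    double minabs = fabs(x[0]) + fabs(x[1]);
    for (long i = 1; i < n; i++) {
        double v = fabs(x[2 * i * incx]) + fabs(x[2 * i * incx + 1]);
        if (v < minabs) { minabs = v; best = i + 1; }
    }
    return best;
}
```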
+/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li.d x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + ADD s2, t1, t2 + ADD s3, t1, t2 + ADD s4, t1, t2 + addi.d N, N, -1 + li.d x1, 1 + bge $r0, N, .L999 + add.d X, X, INCX + li.d x2, 1 + srai.d I, N, 2 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t3, s2 + LD a8, X, 1 * SIZE + CMPLT $fcc2, t5, s3 + add.d X, X, INCX + CMPLT $fcc3, t7, s4 + addi.d I, I, -1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, 
s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t3, s2 + CMPLT $fcc2, t5, s3 + CMPLT $fcc3, t7, s4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + addi.d I, I, -1 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/max.S b/kernel/loongarch64/max.S new file mode 100644 index 000000000..56c3f99a1 --- /dev/null +++ b/kernel/loongarch64/max.S @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
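max.S below returns the largest signed element (no absolute value is taken), and min.S further down is the same code with every comparison flipped. Both keep four partial results (s1..s4) that are folded at .L998. A hedged sketch of the contract, double precision assumed:

```c
/* Largest element of x, or 0 when n <= 0 or incx <= 0, matching the early
 * exits at the top of the kernel. */
static double max_ref(long n, const double *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0.0;
    double m = x[0];
    for (long i = 1; i < n; i++)
        if (x[i * incx] > m)
            m = x[i * incx];
    return m;
}
```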
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD s1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + MOV s2, s1 + bge $r0, N, .L999 + MOV s3, s1 + srai.d I, N, 3 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + CMPLT $fcc0, s1, a1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, a2 + add.d X, X, INCX + CMPLT $fcc2, s3, a3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, a4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + LD a1, X, 0 * SIZE + CMOVT s2, s2, a2, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a3, $fcc2 + LD a2, X, 0 * SIZE + CMOVT s4, s4, a4, $fcc3 + add.d X, X, INCX + CMPLT $fcc0, s1, a5 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, a6 + add.d X, X, INCX + CMPLT $fcc2, s3, a7 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, a8 + add.d X, X, INCX + CMOVT s1, s1, a5, $fcc0 + LD a5, X, 0 * SIZE + CMOVT s2, s2, a6, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a7, $fcc2 + LD a6, X, 0 * SIZE + CMOVT s4, s4, a8, $fcc3 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L12 + .align 3 + +.L13: + CMPLT $fcc0, s1, a1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, a2 + add.d X, X, INCX + CMPLT $fcc2, s3, a3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, a4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + CMOVT s2, s2, a2, $fcc1 + CMOVT s3, s3, a3, $fcc2 + CMOVT s4, s4, a4, $fcc3 + CMPLT $fcc0, s1, a5 + CMPLT $fcc1, s2, a6 + CMPLT $fcc2, s3, a7 + CMPLT $fcc3, s4, a8 + CMOVT s1, s1, a5, $fcc0 + CMOVT s2, s2, a6, $fcc1 + CMOVT s3, s3, a7, $fcc2 + CMOVT s4, s4, a8, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + CMPLT $fcc0, s1, a1 + CMOVT s1, s1, a1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/min.S b/kernel/loongarch64/min.S new file mode 100644 index 000000000..bb2fcfb01 --- /dev/null +++ b/kernel/loongarch64/min.S @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD s1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + MOV s2, s1 + bge $r0, N, .L999 + MOV s3, s1 + srai.d I, N, 3 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + CMPLT $fcc0, a1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, a2, s2 + add.d X, X, INCX + CMPLT $fcc2, a3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, a4, s4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + LD a1, X, 0 * SIZE + CMOVT s2, s2, a2, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a3, $fcc2 + LD a2, X, 0 * SIZE + CMOVT s4, s4, a4, $fcc3 + add.d X, X, INCX + CMPLT $fcc0, a5, s1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, a6, s2 + add.d X, X, INCX + CMPLT $fcc2, a7, s3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, a8, s4 + add.d X, X, INCX + CMOVT s1, s1, a5, $fcc0 + LD a5, X, 0 * SIZE + CMOVT s2, s2, a6, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a7, $fcc2 + LD a6, X, 0 * SIZE + CMOVT s4, s4, a8, $fcc3 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L12 + .align 3 + +.L13: + CMPLT $fcc0, a1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, a2, s2 + add.d X, X, INCX + CMPLT $fcc2, a3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, a4, s4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + CMOVT s2, s2, a2, $fcc1 + CMOVT s3, s3, a3, $fcc2 + CMOVT s4, s4, a4, $fcc3 + CMPLT $fcc0, a5, s1 + CMPLT $fcc1, a6, s2 + CMPLT $fcc2, a7, s3 + CMPLT $fcc3, a8, s4 + CMOVT s1, s1, a5, $fcc0 + CMOVT s2, s2, a6, $fcc1 + CMOVT s3, s3, a7, $fcc2 + CMOVT s4, s4, a8, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + CMPLT $fcc0, a1, s1 + CMOVT s1, s1, a1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s3, s1 + 
CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/scal.S b/kernel/loongarch64/scal.S new file mode 100644 index 000000000..566bce6cb --- /dev/null +++ b/kernel/loongarch64/scal.S @@ -0,0 +1,330 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
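The scal kernel that follows scales x in place. It branches early on alpha == 0 (the CMPEQ/bceqz at the top) and then simply stores zeros, and it keeps separate unit-stride and strided paths, each unrolled by eight. A hedged sketch, assuming a positive stride (the entry code only guards n):

```c
/* x[i*incx] *= alpha for i = 0..n-1; alpha == 0 is handled as a plain
 * store of zeros, as in the assembly's .L12/.L22 paths. */
static void scal_ref(long n, double alpha, double *x, long incx)
{
    if (n <= 0) return;
    if (alpha == 0.0) {
        for (long i = 0; i < n; i++)
            x[i * incx] = 0.0;
        return;
    }
    for (long i = 0; i < n; i++)
        x[i * incx] *= alpha;
}
```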
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define ALPHA $f0 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define t1 $f14 +#define t2 $f15 +#define t3 $f16 +#define t4 $f17 + + PROLOGUE + + li.d TEMP, SIZE + MTC a1, $r0 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, N, .L999 + CMPEQ $fcc0, ALPHA, a1 + bceqz $fcc0, .L50 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + .align 3 + +.L12: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + ST a1, X, 2 * SIZE + ST a1, X, 3 * SIZE + ST a1, X, 4 * SIZE + ST a1, X, 5 * SIZE + ST a1, X, 6 * SIZE + ST a1, X, 7 * SIZE + addi.w I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L16 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: + srai.d I, N, 3 + bge $r0, I, .L25 + .align 3 + +.L22: + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L26: + addi.d I, I, -1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L26 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L50: + srai.d I, N, 3 + bne INCX, TEMP, .L60 + addi.d I, I, -1 + blt I, $r0, .L55 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + LD a7, X, 6 * SIZE + LD a8, X, 7 * SIZE + bge $r0, I, .L53 + .align 3 + +.L52: + MUL t1, ALPHA, a1 + LD a1, X, 8 * SIZE + MUL t2, ALPHA, a2 + LD a2, X, 9 * SIZE + MUL t3, ALPHA, a3 + LD a3, X, 10 * SIZE + MUL t4, ALPHA, a4 + LD a4, X, 11 * SIZE + ST t1, X, 0 * SIZE + MUL t1, ALPHA, a5 + LD a5, X, 12 * SIZE + ST t2, X, 1 * SIZE + MUL t2, ALPHA, a6 + LD a6, X, 13 * SIZE + ST t3, X, 2 * SIZE + MUL t3, ALPHA, a7 + LD a7, X, 14 * SIZE + ST t4, X, 3 * SIZE + MUL t4, ALPHA, a8 + LD a8, X, 15 * SIZE + addi.d I, I, -1 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + blt $r0, I, .L52 + .align 3 + +.L53: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + ST t1, X, 0 * SIZE + MUL t1, ALPHA, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA, a6 + ST t3, X, 2 * SIZE + MUL t3, ALPHA, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA, a8 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + .align 3 + +.L55: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L56: + LD a1, X, 0 * SIZE + MUL t1, ALPHA, a1 + addi.d X, X, SIZE + addi.d I, I, -1 + ST t1, X, -1 * SIZE + blt $r0, I, .L56 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L60: + srai.d I, N, 3 + move XX, X + addi.d I, I, -1 + blt I, $r0, .L65 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d 
X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + add.d X, X, INCX + bge $r0, I, .L63 + .align 3 + +.L62: + MUL t1, ALPHA, a1 + LD a1, X, 0 * SIZE + add.d X, X, INCX + MUL t2, ALPHA, a2 + LD a2, X, 0 * SIZE + add.d X, X, INCX + MUL t3, ALPHA, a3 + LD a3, X, 0 * SIZE + add.d X, X, INCX + MUL t4, ALPHA, a4 + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, X, 0 * SIZE + add.d X, X, INCX + MUL t2, ALPHA, a6 + LD a6, X, 0 * SIZE + add.d X, X, INCX + MUL t3, ALPHA, a7 + LD a7, X, 0 * SIZE + add.d X, X, INCX + MUL t4, ALPHA, a8 + LD a8, X, 0 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L62 + .align 3 + +.L63: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + .align 3 + +.L65: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L66: + LD a1, X, 0 * SIZE + MUL t1, ALPHA, a1 + addi.d I, I, -1 + ST t1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L66 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/snrm2.S b/kernel/loongarch64/snrm2.S new file mode 100644 index 000000000..57c21a017 --- /dev/null +++ b/kernel/loongarch64/snrm2.S @@ -0,0 +1,249 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
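snrm2, introduced here, computes the single-precision 2-norm by widening each element to double (fcvt.d.s), accumulating squares into two double partial sums with fmadd.d, and taking one square root at the end; the extra range of double is what lets it skip the classic rescaling loop. A sketch of that strategy:

```c
#include <math.h>

/* Single-precision Euclidean norm accumulated in double, 0 for n <= 0 or
 * incx <= 0, mirroring the kernel's early exits. */
static float snrm2_ref(long n, const float *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0.0f;
    double s = 0.0;
    for (long i = 0; i < n; i++) {
        double v = (double)x[i * incx];
        s += v * v;
    }
    return (float)sqrt(s);
}
```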
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define a5 $f16 +#define a6 $f17 +#define a7 $f0 +#define a8 $f1 +#define s1 $f22 +#define s2 $f8 +#define t1 $f23 +#define t2 $f9 +#define t3 $f10 +#define t4 $f11 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + movgr2fr.d s1, $r0 + li.d TEMP, SIZE + fmov.d s2, s1 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + LD a6, X, 5 * SIZE + fcvt.d.s t2, a2 + LD a7, X, 6 * SIZE + fcvt.d.s t3, a3 + LD a8, X, 7 * SIZE + fcvt.d.s t4, a4 + bge $r0, I, .L13 + .align 3 + +.L12: + fmadd.d s1, t1, t1, s1 + LD a1, X, 8 * SIZE + fcvt.d.s t1, a5 + NOP + fmadd.d s2, t2, t2, s2 + LD a2, X, 9 * SIZE + fcvt.d.s t2, a6 + NOP + fmadd.d s1, t3, t3, s1 + LD a3, X, 10 * SIZE + fcvt.d.s t3, a7 + NOP + fmadd.d s2, t4, t4, s2 + LD a4, X, 11 * SIZE + fcvt.d.s t4, a8 + NOP + fmadd.d s1, t1, t1, s1 + LD a5, X, 12 * SIZE + fcvt.d.s t1, a1 + NOP + fmadd.d s2, t2, t2, s2 + LD a6, X, 13 * SIZE + fcvt.d.s t2, a2 + addi.d I, I, -1 + fmadd.d s1, t3, t3, s1 + LD a7, X, 14 * SIZE + fcvt.d.s t3, a3 + addi.d X, X, 8 * SIZE + fmadd.d s2, t4, t4, s2 + LD a8, X, 7 * SIZE + fcvt.d.s t4, a4 + blt $r0, I, .L12 + .align 3 + +.L13: + fmadd.d s1, t1, t1, s1 + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + fcvt.d.s t2, a6 + fmadd.d s1, t3, t3, s1 + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + fcvt.d.s t4, a8 + fmadd.d s1, t1, t1, s1 + fmadd.d s2, t2, t2, s2 + fmadd.d s1, t3, t3, s1 + fmadd.d s2, t4, t4, s2 + addi.d X, X, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + fmadd.d s1, t1, t1, s1 + addi.d X, X, SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + fcvt.d.s t2, a2 + fcvt.d.s t3, a3 + fcvt.d.s t4, a4 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 + +.L23: + fmadd.d s1, t1, t1, s1 + LD a1, X, 0 * SIZE + fcvt.d.s t1, a5 + add.d X, X, INCX + fmadd.d s2, t2, t2, s2 + LD a2, X, 0 * SIZE + fcvt.d.s t2, a6 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a3, X, 0 * SIZE + fcvt.d.s t3, a7 + add.d X, X, INCX + fmadd.d s2, t4, t4, s2 + LD a4, X, 0 * SIZE + fcvt.d.s t4, a8 + add.d X, X, INCX + fmadd.d s1, t1, t1, s1 + LD a5, X, 0 * SIZE + fcvt.d.s t1, a1 + add.d 
X, X, INCX + fmadd.d s2, t2, t2, s2 + LD a6, X, 0 * SIZE + fcvt.d.s t2, a2 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a7, X, 0 * SIZE + fcvt.d.s t3, a3 + add.d X, X, INCX + fmadd.d s2, t4, t4, s2 + LD a8, X, 0 * SIZE + fcvt.d.s t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + .align 3 + +.L24: + fmadd.d s1, t1, t1, s1 + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + fcvt.d.s t2, a6 + fmadd.d s1, t3, t3, s1 + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + fcvt.d.s t4, a8 + fmadd.d s1, t1, t1, s1 + fmadd.d s2, t2, t2, s2 + fmadd.d s1, t3, t3, s1 + fmadd.d s2, t4, t4, s2 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + add.d X, X, INCX + fmadd.d s1, t1, t1, s1 + blt $r0, I, .L26 + .align 3 + +.L999: + fadd.d s1, s1, s2 + fsqrt.d s1, s1 + move $r4, $r17 + fcvt.s.d $f0, s1 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/swap.S b/kernel/loongarch64/swap.S new file mode 100644 index 000000000..4578a8d54 --- /dev/null +++ b/kernel/loongarch64/swap.S @@ -0,0 +1,330 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
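swap.S below exchanges the two vectors; its unit-stride path is unrolled by eight, while the strided path loads ahead through X/Y and stores through the trailing XX/YY copies of the pointers. A hedged sketch, positive strides assumed:

```c
/* Exchange x[i*incx] and y[i*incy] for i = 0..n-1. */
static void swap_ref(long n, double *x, long incx, double *y, long incy)
{
    if (n <= 0) return;
    for (long i = 0; i < n; i++) {
        double t = x[i * incx];
        x[i * incx] = y[i * incy];
        y[i * incy] = t;
    }
}
```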
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define b1 $f14 +#define b2 $f15 +#define b3 $f16 +#define b4 $f17 +#define b5 $f0 +#define b6 $f1 +#define b7 $f2 +#define b8 $f3 + + PROLOGUE + + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + bge $r0, N, .L999 + slli.d INCY, INCY, BASE_SHIFT + bne INCX, TEMP, .L20 + srai.d I, N, 3 + bne INCY, TEMP, .L20 + addi.d I, I, -1 + blt I, $r0, .L15 + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + LD a2, X, 1 * SIZE + LD b2, Y, 1 * SIZE + LD a3, X, 2 * SIZE + LD b3, Y, 2 * SIZE + LD a4, X, 3 * SIZE + LD b4, Y, 3 * SIZE + LD a5, X, 4 * SIZE + LD b5, Y, 4 * SIZE + LD a6, X, 5 * SIZE + LD b6, Y, 5 * SIZE + LD a7, X, 6 * SIZE + LD b7, Y, 6 * SIZE + LD a8, X, 7 * SIZE + LD b8, Y, 7 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: + ST a1, Y, 0 * SIZE + LD a1, X, 8 * SIZE + ST b1, X, 0 * SIZE + LD b1, Y, 8 * SIZE + ST a2, Y, 1 * SIZE + LD a2, X, 9 * SIZE + ST b2, X, 1 * SIZE + LD b2, Y, 9 * SIZE + ST a3, Y, 2 * SIZE + LD a3, X, 10 * SIZE + ST b3, X, 2 * SIZE + LD b3, Y, 10 * SIZE + ST a4, Y, 3 * SIZE + LD a4, X, 11 * SIZE + ST b4, X, 3 * SIZE + LD b4, Y, 11 * SIZE + ST a5, Y, 4 * SIZE + LD a5, X, 12 * SIZE + ST b5, X, 4 * SIZE + LD b5, Y, 12 * SIZE + ST a6, Y, 5 * SIZE + LD a6, X, 13 * SIZE + ST b6, X, 5 * SIZE + LD b6, Y, 13 * SIZE + ST a7, Y, 6 * SIZE + LD a7, X, 14 * SIZE + ST b7, X, 6 * SIZE + LD b7, Y, 14 * SIZE + ST a8, Y, 7 * SIZE + LD a8, X, 15 * SIZE + ST b8, X, 7 * SIZE + LD b8, Y, 15 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST a1, Y, 0 * SIZE + ST b1, X, 0 * SIZE + ST a2, Y, 1 * SIZE + ST b2, X, 1 * SIZE + ST a3, Y, 2 * SIZE + ST b3, X, 2 * SIZE + ST a4, Y, 3 * SIZE + ST b4, X, 3 * SIZE + ST a5, Y, 4 * SIZE + ST b5, X, 4 * SIZE + ST a6, Y, 5 * SIZE + ST b6, X, 5 * SIZE + ST a7, Y, 6 * SIZE + ST b7, X, 6 * SIZE + ST a8, Y, 7 * SIZE + ST b8, X, 7 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d I, I, -1 + addi.d Y, Y, SIZE + ST b1, X, -1 * SIZE + ST a1, Y, -1 * SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: + srai.d I, N, 3 + move XX, X + move YY, Y + addi.d I, I, -1 + blt I, $r0, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD b2, Y, 0 * SIZE + add.d Y, Y, INCY + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD b3, Y, 0 * SIZE + add.d Y, Y, INCY + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD b4, Y, 0 * SIZE + add.d Y, Y, INCY + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD b5, Y, 0 * SIZE + add.d Y, Y, INCY + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD b6, Y, 0 * SIZE + add.d Y, Y, INCY + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD b7, Y, 0 * SIZE + add.d Y, Y, INCY + LD a8, X, 0 * SIZE + add.d X, X, INCX + LD b8, Y, 0 * SIZE + add.d Y, Y, INCY + bge $r0, I, .L23 + .align 3 + +.L22: + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + LD a1, X, 0 * SIZE + add.d X, X, INCX + ST b1, XX, 0 * SIZE + add.d XX, XX, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + ST a2, YY, 0 * SIZE + add.d YY, YY, INCY + LD a2, X, 
0 * SIZE + add.d X, X, INCX + ST b2, XX, 0 * SIZE + add.d XX, XX, INCX + LD b2, Y, 0 * SIZE + add.d Y, Y, INCY + ST a3, YY, 0 * SIZE + add.d YY, YY, INCY + LD a3, X, 0 * SIZE + add.d X, X, INCX + ST b3, XX, 0 * SIZE + add.d XX, XX, INCX + LD b3, Y, 0 * SIZE + add.d Y, Y, INCY + ST a4, YY, 0 * SIZE + add.d YY, YY, INCY + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST b4, XX, 0 * SIZE + add.d XX, XX, INCX + LD b4, Y, 0 * SIZE + add.d Y, Y, INCY + ST a5, YY, 0 * SIZE + add.d YY, YY, INCY + LD a5, X, 0 * SIZE + add.d X, X, INCX + ST b5, XX, 0 * SIZE + add.d XX, XX, INCX + LD b5, Y, 0 * SIZE + add.d Y, Y, INCY + ST a6, YY, 0 * SIZE + add.d YY, YY, INCY + LD a6, X, 0 * SIZE + add.d X, X, INCX + ST b6, XX, 0 * SIZE + add.d XX, XX, INCX + LD b6, Y, 0 * SIZE + add.d Y, Y, INCY + ST a7, YY, 0 * SIZE + add.d YY, YY, INCY + LD a7, X, 0 * SIZE + add.d X, X, INCX + ST b7, XX, 0 * SIZE + add.d XX, XX, INCX + LD b7, Y, 0 * SIZE + add.d Y, Y, INCY + ST a8, YY, 0 * SIZE + add.d YY, YY, INCY + LD a8, X, 0 * SIZE + add.d X, X, INCX + ST b8, XX, 0 * SIZE + add.d XX, XX, INCX + LD b8, Y, 0 * SIZE + addi.d I, I, -1 + add.d Y, Y, INCY + blt $r0, I, .L22 + .align 3 + +.L23: + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + ST b1, XX, 0 * SIZE + add.d XX, XX, INCX + ST a2, YY, 0 * SIZE + add.d YY, YY, INCY + ST b2, XX, 0 * SIZE + add.d XX, XX, INCX + ST a3, YY, 0 * SIZE + add.d YY, YY, INCY + ST b3, XX, 0 * SIZE + add.d XX, XX, INCX + ST a4, YY, 0 * SIZE + add.d YY, YY, INCY + ST b4, XX, 0 * SIZE + add.d XX, XX, INCX + ST a5, YY, 0 * SIZE + add.d YY, YY, INCY + ST b5, XX, 0 * SIZE + add.d XX, XX, INCX + ST a6, YY, 0 * SIZE + add.d YY, YY, INCY + ST b6, XX, 0 * SIZE + add.d XX, XX, INCX + ST a7, YY, 0 * SIZE + add.d YY, YY, INCY + ST b7, XX, 0 * SIZE + add.d XX, XX, INCX + ST a8, YY, 0 * SIZE + add.d YY, YY, INCY + ST b8, XX, 0 * SIZE + add.d XX, XX, INCX + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L26: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST b1, X, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L26 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/trsm_kernel_LN.S b/kernel/loongarch64/trsm_kernel_LN.S new file mode 100644 index 000000000..a0bd29f3b --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_LN.S @@ -0,0 +1,2863 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
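trsm_kernel_LN.S, which follows, is one of the four register-blocked TRSM inner kernels; its LN/LT/RN/RT code paths are selected by the preprocessor. The family performs a triangular solve on packed panels while fusing in GEMM-style rank updates (the long MADD chains). As a much-simplified, hedged sketch of the underlying operation only, here is plain forward substitution for a lower-triangular left-hand side; the kernel's panel packing, blocking and KK/OFFSET bookkeeping are not modeled.

```c
/* Solve A * X = B in place (X overwrites B): A is m x m lower triangular,
 * B is m x n, both column-major.  Illustrative only. */
static void trsm_lower_left_ref(long m, long n,
                                const double *a, long lda,
                                double *b, long ldb)
{
    for (long j = 0; j < n; j++) {
        for (long i = 0; i < m; i++) {
            double s = b[i + j * ldb];
            for (long k = 0; k < i; k++)
                s -= a[i + k * lda] * b[k + j * ldb];
            b[i + j * ldb] = s / a[i + i * lda];
        }
    }
}
```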
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r29 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define KK $r30 +#define TEMP $r20 +#define AORIG $r16 +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -144 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + SDARG $r29, $sp, 88 + SDARG $r30, $sp, 96 + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#ifndef __64BIT__ + fst.d $f18, $sp, 112 + fst.d $f19, $sp, 120 + fst.d $f20, $sp, 128 + fst.d $f21, $sp, 136 +#endif + slli.d LDC, LDC, BASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, BASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + neg KK, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + srai.d J, N, 3 +nop + bge $r0, J, .L30 +.L10: +#ifdef RT + slli.d TEMP, K, 3 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 3 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO8, LDC +#endif + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L20 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d 
TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, 
c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 
+ NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +MTC c11, $r0 +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif + MOV c21, c11 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, c11 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif + MOV c41, c11 +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L20: + srai.d I, M, 1 + MOV c51, c11 +MOV c61, c11 + bge $r0, I, .L29 +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + 
MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 
36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, AO, 4 * SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + 
NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE + NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD b7, BO, 30 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * 
SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE 
+#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 8 +#endif +#ifdef RT + addi.d KK, KK, -8 +#endif + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + MOV c21, c11 + add.d CO4, CO3, LDC + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif + andi I, M, 1 +MOV c41, c11 + bge $r0, I, .L40 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 
3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +MTC c11, $r0 +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif + MOV c21, c11 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif + MOV c31, c11 +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L40: + srai.d I, M, 1 + MOV c61, c11 +MOV c41, c11 + bge $r0, I, .L49 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge 
$r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, 
c32 + MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L50: + andi J, N, 2 +#ifdef RT + slli.d TEMP, K, 
1 + BASE_SHIFT +#else + move AO, A +#endif + bge $r0, J, .L70 +#ifdef RT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + andi I, M, 1 + bge $r0, I, .L60 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if 
defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L60: + srai.d I, M, 1 + bge $r0, I, .L69 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + 
NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L70: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif + move AO, A + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + andi I, M, 1 + bge $r0, I, .L80 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L85 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || 
defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L80: + srai.d I, M, 1 + bge $r0, I, .L89 +.L71: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + NMSUB c11, c12, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c12, c11, b2, c12 + MUL c12, b3, c12 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, 
BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + blt $r0, I, .L71 + .align 3 + +.L89: +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + LDARG $r29, $sp, 88 + LDARG $r30, $sp, 96 + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#ifndef __64BIT__ + fld.d $f18, $sp, 112 + fld.d $f19, $sp, 120 + fld.d $f20, $sp, 128 + fld.d $f21, $sp, 136 +#endif + addi.d $sp, $sp, 144 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/trsm_kernel_LT.S b/kernel/loongarch64/trsm_kernel_LT.S new file mode 100644 index 000000000..aa6822c32 --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_LT.S @@ -0,0 +1,2854 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
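Both trsm_kernel_LN.S above and the trsm_kernel_LT.S file that begins here follow the same structure: accumulate a small block of A*B with MADD, subtract it from the packed right-hand side with SUB, then resolve the block against the triangular factor with a MUL/NMSUB substitution chain before storing the result to both the packed buffer and C. A rough C sketch of that substitution step for one column, written under the assumption (suggested by the use of MUL rather than a divide on the diagonal entries) that the packed triangular panel stores its diagonal pre-inverted; the helper name, the indexing, and the use of double are illustrative only:

    /* Backward substitution in the style of the RT path: x[i] is scaled by
     * the (assumed pre-inverted) diagonal entry, then eliminated from the
     * remaining unknowns, mirroring the MUL ... / NMSUB ... pairs above. */
    static void solve_rt_sketch(int n, const double *t, int ldt, double *x)
    {
        for (int i = n - 1; i >= 0; i--) {
            x[i] *= t[i + i * ldt];              /* MUL  c, diag, c     */
            for (int j = 0; j < i; j++)
                x[j] -= t[j + i * ldt] * x[i];   /* NMSUB c, x[i], t, c */
        }
    }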
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r29 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define KK $r30 +#define TEMP $r20 +#define AORIG $r16 +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -144 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + SDARG $r29, $sp, 88 + SDARG $r30, $sp, 96 + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#ifndef __64BIT__ + fst.d $f18, $sp, 112 + fst.d $f19, $sp, 120 + fst.d $f20, $sp, 128 + fst.d $f21, $sp, 136 +#endif + slli.d LDC, LDC, BASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, BASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + srai.d J, N, 3 +nop + bge $r0, J, .L30 +.L10: +#ifdef RT + slli.d TEMP, K, 3 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 3 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO8, LDC +#endif +MOV c61, c11 + bge $r0, I, .L20 +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + srai.d L, TEMP, 2 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 
* SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD 
c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, AO, 4 * 
SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE + NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD b7, BO, 30 * SIZE 
+ MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL 
c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD 
c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 
* SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * 
SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 8 +#endif +#ifdef RT + addi.d KK, KK, -8 +#endif + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + srai.d I, M, 1 + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c41, c11 + bge $r0, I, .L40 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, 
.L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 
+ MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE 
+ blt $r0, L, .L46 +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L50: + andi J, N, 2 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT +#else + move AO, A +#endif + bge $r0, J, .L70 +#ifdef RT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L60 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + 
LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, 
AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 
* SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L70: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif + move AO, A + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L80 +.L71: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + NMSUB c11, c12, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LT 
+ LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c12, c11, b2, c12 + MUL c12, b3, c12 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + blt $r0, I, .L71 + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L85 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L89: +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + add.d B, B, TEMP +#endif +#if 
defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + LDARG $r29, $sp, 88 + LDARG $r30, $sp, 96 + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#ifndef __64BIT__ + fld.d $f18, $sp, 112 + fld.d $f19, $sp, 120 + fld.d $f20, $sp, 128 + fld.d $f21, $sp, 136 +#endif + addi.d $sp, $sp, 144 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/trsm_kernel_RT.S b/kernel/loongarch64/trsm_kernel_RT.S new file mode 100644 index 000000000..c86d9c1e5 --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_RT.S @@ -0,0 +1,2850 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r29 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define KK $r30 +#define TEMP $r20 +#define AORIG $r16 +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -144 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + SDARG $r29, $sp, 88 + SDARG $r30, $sp, 96 + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#ifndef __64BIT__ + fst.d $f18, $sp, 112 + fst.d $f19, $sp, 120 + fst.d $f20, $sp, 128 + fst.d $f21, $sp, 136 +#endif + slli.d LDC, LDC, BASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, BASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + andi J, N, 1 + bge $r0, J, .L30 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif + move AO, A + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L80 +.L71: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 
* SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + NMSUB c11, c12, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c12, c11, b2, c12 + MUL c12, b3, c12 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + blt $r0, I, .L71 + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + MOV c21, c11 + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L85 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MOV c21, c11 + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + 
blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L89: +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L30: + andi J, N, 2 + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L60 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + 
LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, 
.L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L50: + andi J, N, 4 +move AO, A + bge $r0, J, .L70 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + srai.d I, M, 1 + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c41, c11 + bge $r0, I, .L40 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, 
L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + 
MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 
8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL 
c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L70: + srai.d J, N, 3 +nop + bge $r0, J, .L999 +.L10: +#ifdef RT + slli.d TEMP, K, 3 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 3 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO8, LDC +#endif +MOV c61, c11 + bge $r0, I, .L20 +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, 
c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD 
c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, AO, 4 * SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL 
c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE + NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD b7, BO, 30 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, 
c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * 
SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD 
c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD 
b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN 
+ addi.d KK, KK, 8 +#endif +#ifdef RT + addi.d KK, KK, -8 +#endif + blt $r0, J, .L10 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + LDARG $r29, $sp, 88 + LDARG $r30, $sp, 96 + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#ifndef __64BIT__ + fld.d $f18, $sp, 112 + fld.d $f19, $sp, 120 + fld.d $f20, $sp, 128 + fld.d $f21, $sp, 136 +#endif + addi.d $sp, $sp, 144 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zamax.S b/kernel/loongarch64/zamax.S new file mode 100644 index 000000000..f998bdc23 --- /dev/null +++ b/kernel/loongarch64/zamax.S @@ -0,0 +1,190 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + bge $r0, N, .L999 + ADD s2, t1, t2 + srai.d I, N, 2 + ADD s3, t1, t2 + ADD s4, t1, t2 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t3 + LD a8, X, 1 * SIZE + CMPLT $fcc2, s3, t5 + add.d X, X, INCX + CMPLT $fcc3, s4, t7 + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t3, $fcc1 + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t3 + CMPLT $fcc2, s3, t5 + CMPLT $fcc3, s4, t7 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t3, $fcc1 + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zamin.S b/kernel/loongarch64/zamin.S new file mode 100644 index 000000000..bde9aebf8 --- /dev/null +++ b/kernel/loongarch64/zamin.S @@ -0,0 +1,198 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + bge $r0, N, .L999 + NOP + ADD s2, t1, t2 + srai.d I, N, 2 + ADD s3, t1, t2 + ADD s4, t1, t2 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + NOP + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + NOP + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + NOP + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t3, s2 + LD a8, X, 1 * SIZE + CMPLT $fcc2, t5, s3 + add.d X, X, INCX + CMPLT $fcc3, t7, s4 + NOP + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t3, $fcc1 + NOP + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + blt $r0, I, .L12 + NOP + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t3, s2 + CMPLT $fcc2, t5, s3 + CMPLT $fcc3, t7, s4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t3, $fcc1 + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + 
.align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + NOP + + EPILOGUE diff --git a/kernel/loongarch64/zasum.S b/kernel/loongarch64/zasum.S new file mode 100644 index 000000000..d1a1a732c --- /dev/null +++ b/kernel/loongarch64/zasum.S @@ -0,0 +1,158 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
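
For reference, a minimal C sketch of what the zasum.S kernel added below computes: the sum of |Re(x_i)| + |Im(x_i)| over a strided double-complex vector (the assembly keeps two partial sums, s1 and s2, and adds them at .L999). Names and the assumption of a positive stride are illustrative, not the kernel's actual interface.

/* Illustrative reference sketch only -- not the kernel's actual interface. */
#include <math.h>

double zasum_ref(long n, const double *x, long incx)
{
    double s = 0.0;
    for (long i = 0; i < n; i++) {
        const double *p = x + 2 * i * incx;     /* two doubles per complex element */
        s += fabs(p[0]) + fabs(p[1]);           /* FABS, FABS, ADD per element     */
    }
    return s;                                   /* s1 + s2 in the assembly         */
}
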
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f23 +#define a2 $f9 +#define a3 $f10 +#define a4 $f11 +#define a5 $f12 +#define a6 $f13 +#define a7 $f14 +#define a8 $f15 +#define t1 $f16 +#define t2 $f17 +#define t3 $f0 +#define t4 $f1 +#define s1 $f22 +#define s2 $f8 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + MTC s2, $r0 + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bge $r0, N, .L999 + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + FABS t3, a3 + FABS t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 + +.L23: + ADD s1, s1, t1 + LD a1, X, 0 * SIZE + FABS t1, a5 + addi.d I, I, -1 + ADD s2, s2, t2 + LD a2, X, 1 * SIZE + FABS t2, a6 + add.d X, X, INCX + ADD s1, s1, t3 + LD a3, X, 0 * SIZE + FABS t3, a7 + NOP + ADD s2, s2, t4 + LD a4, X, 1 * SIZE + FABS t4, a8 + add.d X, X, INCX + ADD s1, s1, t1 + LD a5, X, 0 * SIZE + FABS t1, a1 + NOP + ADD s2, s2, t2 + LD a6, X, 1 * SIZE + FABS t2, a2 + add.d X, X, INCX + ADD s1, s1, t3 + LD a7, X, 0 * SIZE + FABS t3, a3 + LD a8, X, 1 * SIZE + ADD s2, s2, t4 + add.d X, X, INCX + FABS t4, a4 + blt $r0, I, .L23 + .align 3 + +.L24: + ADD s1, s1, t1 + FABS t1, a5 + ADD s2, s2, t2 + FABS t2, a6 + ADD s1, s1, t3 + FABS t3, a7 + ADD s2, s2, t4 + FABS t4, a8 + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS t1, a1 + addi.d I, I, -1 + FABS t2, a2 + add.d X, X, INCX + ADD s1, s1, t1 + ADD s2, s2, t2 + blt $r0, I, .L26 + .align 3 + +.L999: + ADD s1, s1, s2 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zcopy.S b/kernel/loongarch64/zcopy.S new file mode 100644 index 000000000..0f480ca85 --- /dev/null +++ b/kernel/loongarch64/zcopy.S @@ -0,0 +1,217 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
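
For reference, the zcopy.S kernel added below is a strided copy of n double-complex elements from X to Y; a hedged C equivalent, assuming positive strides and illustrative names, is simply:

/* Illustrative reference sketch only -- not the kernel's actual interface. */
void zcopy_ref(long n, const double *x, long incx, double *y, long incy)
{
    for (long i = 0; i < n; i++) {
        y[2 * i * incy]     = x[2 * i * incx];      /* real part      */
        y[2 * i * incy + 1] = x[2 * i * incx + 1];  /* imaginary part */
    }
}
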
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + li.d TEMP, 2 * SIZE + NOP + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, N, .L999 + slli.d INCY, INCY, ZBASE_SHIFT + bne INCX, TEMP, .L20 + srai.d I, N, 2 + bne INCY, TEMP, .L20 + addi.d I, I, -1 + blt I, $r0, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + LD a7, X, 6 * SIZE + LD a8, X, 7 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: + ST a1, Y, 0 * SIZE + LD a1, X, 8 * SIZE + ST a2, Y, 1 * SIZE + LD a2, X, 9 * SIZE + ST a3, Y, 2 * SIZE + LD a3, X, 10 * SIZE + ST a4, Y, 3 * SIZE + LD a4, X, 11 * SIZE + ST a5, Y, 4 * SIZE + LD a5, X, 12 * SIZE + ST a6, Y, 5 * SIZE + LD a6, X, 13 * SIZE + ST a7, Y, 6 * SIZE + LD a7, X, 14 * SIZE + ST a8, Y, 7 * SIZE + LD a8, X, 15 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, Y, 2 * SIZE + ST a4, Y, 3 * SIZE + ST a5, Y, 4 * SIZE + ST a6, Y, 5 * SIZE + ST a7, Y, 6 * SIZE + ST a8, Y, 7 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + ST a1, Y, -2 * SIZE + addi.d I, I, -1 + ST a2, Y, -1 * SIZE + blt $r0, I, .L16 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + NOP + .align 3 + +.L20: + srai.d I, N, 2 + addi.d I, I, -1 + blt I, $r0, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + add.d X, X, INCX + bge $r0, I, .L23 + .align 3 + +.L22: + ST a1, Y, 0 * SIZE + LD a1, X, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST a3, Y, 0 * SIZE + LD a3, X, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a5, Y, 0 * SIZE + LD a5, X, 0 * SIZE + ST a6, Y, 1 * SIZE + add.d Y, Y, INCY + LD a6, X, 1 * SIZE + add.d X, X, INCX + ST a7, Y, 0 * SIZE + LD a7, X, 0 * SIZE + ST a8, Y, 1 * SIZE + add.d Y, Y, INCY + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L23: + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + ST a5, Y, 0 * SIZE + ST a6, Y, 1 * SIZE + add.d Y, Y, INCY + ST a7, Y, 0 * SIZE + ST a8, Y, 1 * SIZE + add.d Y, Y, INCY + .align 3 + 
+.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L26 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zdot.S b/kernel/loongarch64/zdot.S new file mode 100644 index 000000000..81ac19fbd --- /dev/null +++ b/kernel/loongarch64/zdot.S @@ -0,0 +1,330 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
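
For reference, a hedged C sketch of what the zdot.S kernel added below accumulates: four partial sums s1..s4 of the real products, combined at .L999 either as the unconjugated dot product x . y or, when CONJ is defined, as conj(x) . y. The function name and the conj_x flag are illustrative; the real kernel selects the variant at compile time.

/* Illustrative reference sketch only -- not the kernel's actual interface. */
#include <complex.h>

double complex zdot_ref(long n, const double *x, long incx,
                        const double *y, long incy, int conj_x)
{
    double s1 = 0.0, s2 = 0.0, s3 = 0.0, s4 = 0.0;
    for (long i = 0; i < n; i++) {
        double xr = x[2 * i * incx], xi = x[2 * i * incx + 1];
        double yr = y[2 * i * incy], yi = y[2 * i * incy + 1];
        s1 += yr * xr;  s2 += yr * xi;          /* the four MADD accumulators */
        s3 += yi * xr;  s4 += yi * xi;
    }
    if (conj_x)                                 /* CONJ defined: conj(x) . y  */
        return (s1 + s4) + (s3 - s2) * I;
    return (s1 - s4) + (s3 + s2) * I;           /* otherwise plain x . y      */
}
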
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define b1 $f14 +#define b2 $f15 +#define b3 $f16 +#define b4 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + MTC s1, $r0 + MOV s2, s1 + MOV s3, s2 + MOV s4, s3 + slli.d INCX, INCX, ZBASE_SHIFT + li.d TEMP, 2 * SIZE + slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + LD b2, Y, 1 * SIZE + bge $r0, I, .L14 + .align 3 + +.L13: + MADD s1, b1, a1, s1 + LD a3, X, 2 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 3 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 2 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 3 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 4 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 5 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 4 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 5 * SIZE + MADD s1, b1, a1, s1 + LD a3, X, 6 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 7 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 6 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 7 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 8 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 9 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 8 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 9 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L13 + .align 3 + +.L14: + MADD s1, b1, a1, s1 + LD a3, X, 2 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 3 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 2 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 3 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 4 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 5 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 4 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 5 * SIZE + MADD s1, b1, a1, s1 + LD a3, X, 6 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 7 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 6 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 7 * SIZE + MADD s1, b3, a3, s1 + addi.d X, X, 8 * SIZE + MADD s2, b3, a4, s2 + addi.d Y, Y, 8 * SIZE + MADD s3, b4, a3, s3 + MADD s4, b4, a4, s4 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L999 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + LD b2, Y, 1 * SIZE + bge $r0, I, .L17 + .align 3 + +.L16: + MADD s1, b1, a1, s1 + addi.d I, I, -1 + MADD s2, b1, a2, s2 + LD b1, Y, 2 * SIZE + MADD s3, b2, a1, s3 + LD a1, X, 2 * SIZE + MADD s4, b2, a2, s4 + LD a2, X, 3 * SIZE + LD b2, Y, 3 * SIZE + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L16 + .align 3 + +.L17: + MADD s1, b1, a1, s1 + MADD s2, b1, a2, s2 + MADD s3, b2, a1, s3 + MADD s4, b2, a2, s4 + b .L999 + .align 3 + +.L20: +#ifdef F_INTERFACE + bgez INCX, .L21 + addi.d TEMP, N, -1 + mult TEMP, INCX + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + addi.d TEMP, N, -1 + mult TEMP, INCY + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + add.d Y, Y, INCY + bge $r0, I, .L24 + .align 3 + +.L23: + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, 
a2, s4 + LD b4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + add.d Y, Y, INCY + blt $r0, I, .L23 + .align 3 + +.L24: + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + MADD s1, b3, a3, s1 + add.d X, X, INCX + MADD s2, b3, a4, s2 + add.d Y, Y, INCY + MADD s3, b4, a3, s3 + MADD s4, b4, a4, s4 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + MADD s1, b1, a1, s1 + MADD s2, b1, a2, s2 + MADD s3, b2, a1, s3 + MADD s4, b2, a2, s4 + add.d X, X, INCX + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L26 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 +#else + ADD $f0, s1, s4 +#endif +#ifndef CONJ + ADD $f1, s3, s2 +#else + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zgemm3m_kernel.S b/kernel/loongarch64/zgemm3m_kernel.S new file mode 100644 index 000000000..f9acb6cfc --- /dev/null +++ b/kernel/loongarch64/zgemm3m_kernel.S @@ -0,0 +1,1359 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
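
For reference, a hedged C sketch of the write-back pattern used throughout the zgemm3m_kernel.S file added below: each accumulator holds a purely real sum acc = sum_k a[k]*b[k], which is then scaled by the complex alpha and added into the interleaved complex C (real part at offset 0, imaginary part at offset 1); the 3M approach builds the full complex product out of real-only passes of this kind. Names are illustrative, and this shows only the scalar 1x1 case.

/* Illustrative reference sketch only -- not the kernel's actual interface. */
void zgemm3m_1x1(long k, double alpha_r, double alpha_i,
                 const double *a, const double *b, double *c)
{
    double acc = 0.0;
    for (long l = 0; l < k; l++)
        acc += a[l] * b[l];                     /* the unrolled MADD loops          */
    c[0] += alpha_r * acc;                      /* MADD ..., cxx, ALPHA_R, ...      */
    c[1] += alpha_i * acc;                      /* MADD ..., cxx, ALPHA_I, ...      */
}
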
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r11 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 + +#define a1 $f22 +#define a2 $f8 +#define a3 $f28 +#define a4 $f29 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f4 +#define c31 $f2 +#define c32 $f5 +#define c41 $f6 +#define c42 $f7 +#define c51 $f18 +#define c52 $f19 +#define c61 $f20 +#define c62 $f21 +#define c71 $f24 +#define c72 $f25 +#define c81 $f26 +#define c82 $f27 +#define ALPHA_R $f0 +#define ALPHA_I $f1 + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + fst.d $f29, $sp, 88 + slli.d LDC, LDC, ZBASE_SHIFT + srai.d J, N, 3 + bge $r0, J, .L30 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 + add.d C, CO8, LDC +MOV c61, c11 + bge $r0, I, .L20 +.L11: + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD a4, AO, 2 * SIZE + MADD c61, b2, a1, c61 + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, 
a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD a4, AO, 6 * SIZE + MADD c61, b2, a3, c61 + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + 
MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: + andi L, K, 3 + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO2, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO2, 2 * SIZE + MADD $f23, c12, ALPHA_R, $f23 + LD $f13, CO2, 3 * SIZE + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c21, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c22, ALPHA_R, $f12 + ST $f23, CO1, 2 * SIZE + MADD $f13, c22, ALPHA_I, $f13 + ST $f9, CO1, 3 * SIZE + LD $f22, CO3, 0 * SIZE + LD $f8, CO3, 1 * SIZE + LD $f23, CO3, 2 * SIZE + LD $f9, CO3, 3 * SIZE + ST $f10, CO2, 0 * SIZE + ST $f11, CO2, 1 * SIZE + ST $f12, CO2, 2 * SIZE + ST $f13, CO2, 3 * SIZE + LD $f10, CO4, 0 * SIZE + LD $f11, CO4, 1 * SIZE + LD $f12, CO4, 2 * SIZE + LD $f13, CO4, 3 * SIZE + MADD $f22, c31, ALPHA_R, $f22 + MADD $f8, c31, ALPHA_I, $f8 + MADD $f23, c32, ALPHA_R, $f23 + MADD $f9, c32, ALPHA_I, $f9 + MADD $f10, c41, ALPHA_R, $f10 + ST $f22, CO3, 0 * SIZE + MADD $f11, c41, ALPHA_I, $f11 + ST $f8, CO3, 1 * SIZE + MADD $f12, c42, ALPHA_R, $f12 + ST $f23, CO3, 2 * SIZE + MADD $f13, c42, ALPHA_I, $f13 + ST $f9, CO3, 3 * SIZE + LD $f22, CO5, 0 * SIZE + LD $f8, CO5, 1 * SIZE + LD $f23, CO5, 2 * SIZE + LD $f9, CO5, 3 * SIZE + ST $f10, CO4, 0 * SIZE + ST $f11, CO4, 1 * SIZE + ST $f12, CO4, 2 * SIZE + ST $f13, CO4, 3 * SIZE + LD $f10, CO6, 0 * SIZE + LD $f11, CO6, 1 * SIZE + LD $f12, CO6, 2 * SIZE + LD $f13, CO6, 3 * SIZE + MADD $f22, c51, ALPHA_R, $f22 + addi.d CO1,CO1, 4 * SIZE + MADD $f8, c51, ALPHA_I, $f8 + addi.d CO2,CO2, 4 * SIZE + MADD $f23, c52, ALPHA_R, $f23 + addi.d CO3,CO3, 4 * SIZE + MADD $f9, c52, ALPHA_I, $f9 + addi.d CO4,CO4, 4 * SIZE + MADD $f10, c61, ALPHA_R, $f10 + ST $f22, CO5, 0 * SIZE + MADD $f11, c61, ALPHA_I, $f11 + ST $f8, CO5, 1 * SIZE + MADD $f12, c62, ALPHA_R, $f12 + ST $f23, CO5, 2 * SIZE + MADD $f13, c62, ALPHA_I, $f13 + ST $f9, CO5, 3 * SIZE + LD $f22, CO7, 0 * SIZE + LD $f8, CO7, 1 * SIZE + LD $f23, CO7, 2 * SIZE + LD $f9, CO7, 3 * SIZE + ST $f10, CO6, 0 * SIZE + ST $f11, CO6, 1 * SIZE + ST $f12, CO6, 2 * SIZE + ST $f13, CO6, 3 * 
SIZE + LD $f10, CO8, 0 * SIZE + addi.d I, I, -1 + LD $f11, CO8, 1 * SIZE +MTC c11, $r0 + LD $f12, CO8, 2 * SIZE + LD $f13, CO8, 3 * SIZE + MADD $f22, c71, ALPHA_R, $f22 + addi.d CO5,CO5, 4 * SIZE + MADD $f8, c71, ALPHA_I, $f8 + addi.d CO6,CO6, 4 * SIZE + MADD $f23, c72, ALPHA_R, $f23 + addi.d CO7,CO7, 4 * SIZE + MADD $f9, c72, ALPHA_I, $f9 + addi.d CO8,CO8, 4 * SIZE + MADD $f10, c81, ALPHA_R, $f10 + ST $f22, CO7, -4 * SIZE + MADD $f11, c81, ALPHA_I, $f11 + ST $f8, CO7, -3 * SIZE + MADD $f12, c82, ALPHA_R, $f12 + ST $f23, CO7, -2 * SIZE + MADD $f13, c82, ALPHA_I, $f13 + ST $f9, CO7, -1 * SIZE + ST $f10, CO8, -4 * SIZE + MOV c21, c11 + ST $f11, CO8, -3 * SIZE + MOV c31, c11 + ST $f12, CO8, -2 * SIZE + MOV c41, c11 + ST $f13, CO8, -1 * SIZE + MOV c51, c11 +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: + andi L, K, 3 + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + LD $f10, CO3, 
0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO3, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO4, 0 * SIZE + MADD $f23, c21, ALPHA_R, $f23 + LD $f13, CO4, 1 * SIZE + MADD $f9, c21, ALPHA_I, $f9 + MADD $f10, c31, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c31, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c41, ALPHA_R, $f12 + ST $f23, CO2, 0 * SIZE + MADD $f13, c41, ALPHA_I, $f13 + ST $f9, CO2, 1 * SIZE + LD $f22, CO5, 0 * SIZE + LD $f8, CO5, 1 * SIZE + LD $f23, CO6, 0 * SIZE + LD $f9, CO6, 1 * SIZE + ST $f10, CO3, 0 * SIZE + ST $f11, CO3, 1 * SIZE + ST $f12, CO4, 0 * SIZE + ST $f13, CO4, 1 * SIZE + LD $f10, CO7, 0 * SIZE + MADD $f22, c51, ALPHA_R, $f22 + LD $f11, CO7, 1 * SIZE + MADD $f8, c51, ALPHA_I, $f8 + LD $f12, CO8, 0 * SIZE + MADD $f23, c61, ALPHA_R, $f23 + LD $f13, CO8, 1 * SIZE + MADD $f9, c61, ALPHA_I, $f9 + MADD $f10, c71, ALPHA_R, $f10 + ST $f22, CO5, 0 * SIZE + MADD $f11, c71, ALPHA_I, $f11 + ST $f8, CO5, 1 * SIZE + MADD $f12, c81, ALPHA_R, $f12 + ST $f23, CO6, 0 * SIZE + MADD $f13, c81, ALPHA_I, $f13 + ST $f9, CO6, 1 * SIZE + ST $f10, CO7, 0 * SIZE + ST $f11, CO7, 1 * SIZE + ST $f12, CO8, 0 * SIZE + ST $f13, CO8, 1 * SIZE + .align 3 + +.L29: +move B, BO + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + add.d C, CO4, LDC + MOV c31, c11 + srai.d I, M, 1 +MOV c41, c11 + bge $r0, I, .L40 +.L31: + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: + andi L, K, 3 + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * 
SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + LD $f11, CO2, 1 * SIZE + LD $f12, CO2, 2 * SIZE + LD $f13, CO2, 3 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c12, ALPHA_R, $f23 + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c21, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c22, ALPHA_R, $f12 + ST $f23, CO1, 2 * SIZE + MADD $f13, c22, ALPHA_I, $f13 + ST $f9, CO1, 3 * SIZE + LD $f22, CO3, 0 * SIZE + LD $f8, CO3, 1 * SIZE + LD $f23, CO3, 2 * SIZE + LD $f9, CO3, 3 * SIZE + ST $f10, CO2, 0 * SIZE + MADD $f22, c31, ALPHA_R, $f22 + ST $f11, CO2, 1 * SIZE + MADD $f8, c31, ALPHA_I, $f8 + ST $f12, CO2, 2 * SIZE + MADD $f23, c32, ALPHA_R, $f23 + ST $f13, CO2, 3 * SIZE + MADD $f9, c32, ALPHA_I, $f9 + LD $f10, CO4, 0 * SIZE + LD $f11, CO4, 1 * SIZE + LD $f12, CO4, 2 * SIZE + LD $f13, CO4, 3 * SIZE + MADD $f10, c41, ALPHA_R, $f10 + addi.d CO1,CO1, 4 * SIZE + MADD $f11, c41, ALPHA_I, $f11 + addi.d CO2,CO2, 4 * SIZE + MADD $f12, c42, ALPHA_R, $f12 + addi.d CO3,CO3, 4 * SIZE + MADD $f13, c42, ALPHA_I, $f13 + addi.d CO4,CO4, 4 * SIZE + ST $f22, CO3, -4 * SIZE + addi.d I, I, -1 + ST $f8, CO3, -3 * SIZE + ST $f23, CO3, -2 * SIZE + ST $f9, CO3, -1 * SIZE + ST $f10, CO4, -4 * SIZE +MTC c11, $r0 + ST $f11, CO4, -3 * SIZE + MOV c21, c11 + ST $f12, CO4, -2 * SIZE + MOV c31, c11 + ST $f13, CO4, -1 * SIZE +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L45 + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: + andi L, K, 3 + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + LD $f10, CO3, 0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO3, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO4, 0 * 
SIZE + MADD $f23, c21, ALPHA_R, $f23 + LD $f13, CO4, 1 * SIZE + MADD $f9, c21, ALPHA_I, $f9 + MADD $f10, c31, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c31, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c41, ALPHA_R, $f12 + ST $f23, CO2, 0 * SIZE + MADD $f13, c41, ALPHA_I, $f13 + ST $f9, CO2, 1 * SIZE + ST $f10, CO3, 0 * SIZE + ST $f11, CO3, 1 * SIZE + ST $f12, CO4, 0 * SIZE + ST $f13, CO4, 1 * SIZE + .align 3 + +.L49: + move B, BO + .align 3 + +.L50: + andi J, N, 2 +move AO, A + bge $r0, J, .L70 + move CO1, C + add.d CO2, C, LDC + srai.d I, M, 1 +add.d C, CO2, LDC + bge $r0, I, .L60 +.L51: + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: + andi L, K, 3 + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + LD $f11, CO2, 1 * SIZE + LD $f12, CO2, 2 * SIZE + LD $f13, CO2, 3 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + addi.d I, I, -1 + MADD $f8, c11, ALPHA_I, $f8 + addi.d CO1,CO1, 4 * SIZE + MADD $f23, c12, ALPHA_R, $f23 + addi.d CO2,CO2, 4 * SIZE + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + MADD $f11, c21, ALPHA_I, $f11 + MADD $f12, c22, ALPHA_R, $f12 + MADD $f13, c22, ALPHA_I, $f13 + ST $f22, CO1, -4 * SIZE + ST $f8, CO1, -3 * SIZE + ST $f23, CO1, -2 * SIZE + ST $f9, CO1, -1 * SIZE + ST $f10, CO2, -4 * SIZE + ST $f11, CO2, -3 * SIZE + ST $f12, CO2, -2 * SIZE + ST $f13, CO2, -1 * SIZE + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 + srai.d L, K, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD 
c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: + andi L, K, 3 + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + ADD c11, c11, c31 + ADD c21, c21, c41 + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c21, ALPHA_R, $f23 + MADD $f9, c21, ALPHA_I, $f9 + ST $f22, CO1, 0 * SIZE + ST $f8, CO1, 1 * SIZE + ST $f23, CO2, 0 * SIZE + ST $f9, CO2, 1 * SIZE + .align 3 + +.L69: + move B, BO + .align 3 + +.L70: + andi J, N, 1 +move AO, A + bge $r0, J, .L999 + move CO1, C + srai.d I, M, 1 +add.d C, CO1, LDC + bge $r0, I, .L80 +.L71: + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: + andi L, K, 3 + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + ADD c11, c11, c21 + addi.d I, I, -1 + ADD c12, c12, c22 + addi.d CO1,CO1, 4 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c12, ALPHA_R, $f23 + MADD $f9, c12, ALPHA_I, $f9 + ST $f22, CO1, -4 * SIZE + ST $f8, CO1, -3 * SIZE + ST $f23, CO1, -2 * SIZE + ST $f9, CO1, -1 * SIZE + blt $r0, I, .L71 + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L85 + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: + andi L, K, 3 + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, 
-1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + ADD c11, c11, c21 + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + ST $f22, CO1, 0 * SIZE + ST $f8, CO1, 1 * SIZE + .align 3 + +.L89: + move B, BO + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + fld.d $f29, $sp, 88 + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zgemm_kernel.S b/kernel/loongarch64/zgemm_kernel.S new file mode 100644 index 000000000..2d50d41a5 --- /dev/null +++ b/kernel/loongarch64/zgemm_kernel.S @@ -0,0 +1,1047 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
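
For reference, a hedged C sketch of the sign convention behind the MADD1..MADD4 macro selection in the zgemm_kernel.S file added below: the four real products a_re*b_re, a_re*b_im, a_im*b_re and a_im*b_im are accumulated with signs that correspond to conjugating A and/or B in the complex product, which is all that distinguishes the NN/NT/NR/NC/RN/... cases. The helper below is illustrative, not the kernel's actual interface.

/* Illustrative reference sketch only -- not the kernel's actual interface. */
#include <complex.h>

static inline double complex zgemm_acc(double complex acc,
                                       double complex a, double complex b,
                                       int conj_a, int conj_b)
{
    if (conj_a) a = conj(a);    /* RN/RT/CN/CT-style cases conjugate A          */
    if (conj_b) b = conj(b);    /* NR/NC/TR/TC-style cases conjugate B          */
    return acc + a * b;         /* the signs are folded into MADD2..MADD4       */
}
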
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 + +#if defined(TRMMKERNEL) +#define OFFSET $r11 +#define KK $r26 +#define TEMP $r27 +#endif + +#define a1 $f22 +#define a2 $f8 +#define a3 $f28 +#define a4 $f29 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f4 +#define c31 $f2 +#define c32 $f5 +#define c41 $f6 +#define c42 $f7 +#define c51 $f18 +#define c52 $f19 +#define c61 $f20 +#define c62 $f21 +#define c71 $f24 +#define c72 $f25 +#define c81 $f26 +#define c82 $f27 +#define ALPHA_R $f0 +#define ALPHA_I $f1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 64 + fst.d $f24, $sp, 16 + fst.d $f25, $sp, 24 + fst.d $f26, $sp, 32 + fst.d $f27, $sp, 40 + fst.d $f28, $sp, 48 + fst.d $f29, $sp, 56 +#if defined(TRMMKERNEL) + SDARG $r26, $sp, 72 + SDARG $r27, $sp, 80 +#endif +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#if defined(TRMMKERNEL) && !defined(LEFT) + sub.d KK, $r0, OFFSET +#endif + srai.d J, N, 2 +nop + bge $r0, J, .L20 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + MOV c41, c11 + MOV c51, c11 + move I, M + add.d C, CO4, LDC + MOV c61, c11 + bge $r0, I, .L19 +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 4 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#else + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + 
MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 
* SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + LD b3, CO2, 0 * SIZE + ADD c31, c31, c42 + LD b4, CO2, 1 * SIZE + ADD c32, c32, c41 + LD b5, CO3, 0 * SIZE + ADD c51, c51, c62 + LD b6, CO3, 1 * SIZE + ADD c52, c52, c61 + LD b7, CO4, 0 * SIZE + ADD c71, c71, c82 + LD b8, CO4, 1 * SIZE + ADD c72, c72, c81 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d CO2,CO2, 2 * SIZE + MADD b3, c31, ALPHA_R, b3 + addi.d CO3,CO3, 2 * SIZE + MADD b4, c32, ALPHA_R, b4 + addi.d CO4,CO4, 2 * SIZE + MADD b5, c51, ALPHA_R, b5 + addi.d I, I, -1 + MADD b6, c52, ALPHA_R, b6 + MADD b7, c71, ALPHA_R, b7 + MADD b8, c72, ALPHA_R, b8 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, 
b4 + ST b1, CO1, -2 * SIZE + NMSUB b5, c52, ALPHA_I, b5 + ST b2, CO1, -1 * SIZE + MADD b6, c51, ALPHA_I, b6 + ST b3, CO2, -2 * SIZE + NMSUB b7, c72, ALPHA_I, b7 + ST b4, CO2, -1 * SIZE + MADD b8, c71, ALPHA_I, b8 + ST b5, CO3, -2 * SIZE + MOV c21, c11 + ST b6, CO3, -1 * SIZE + MOV c31, c11 + ST b7, CO4, -2 * SIZE + MOV c41, c11 + ST b8, CO4, -1 * SIZE + MOV c51, c11 +#else + ADD c11, c11, c22 + addi.d CO1,CO1, 2 * SIZE + ADD c12, c12, c21 + addi.d CO2,CO2, 2 * SIZE + ADD c31, c31, c42 + addi.d CO3,CO3, 2 * SIZE + ADD c32, c32, c41 + addi.d CO4,CO4, 2 * SIZE + ADD c51, c51, c62 + addi.d I, I, -1 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 + MUL b1, ALPHA_R, c11 + MUL b2, ALPHA_R, c12 + MUL b3, ALPHA_R, c31 + MUL b4, ALPHA_R, c32 + MUL b5, ALPHA_R, c51 + MUL b6, ALPHA_R, c52 + MUL b7, ALPHA_R, c71 + MUL b8, ALPHA_R, c72 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + NMSUB b5, c52, ALPHA_I, b5 + ST b2, CO1, -1 * SIZE + MADD b6, c51, ALPHA_I, b6 + ST b3, CO2, -2 * SIZE + NMSUB b7, c72, ALPHA_I, b7 + ST b4, CO2, -1 * SIZE + MADD b8, c71, ALPHA_I, b8 + ST b5, CO3, -2 * SIZE + MOV c21, c11 + ST b6, CO3, -1 * SIZE + MOV c31, c11 + ST b7, CO4, -2 * SIZE + MOV c41, c11 + ST b8, CO4, -1 * SIZE + MOV c51, c11 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -4 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 4 +#endif +move B, BO + blt $r0, J, .L10 + .align 3 + +.L20: + andi J, N, 2 + MTC c11, $r0 +move CO1, C + bge $r0, J, .L30 + add.d CO2, C, LDC + add.d C, CO2, LDC +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + move I, M +move AO, A + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 2 +#endif + srai.d L, TEMP, 2 +MOV c42, c11 + bge $r0, L, .L25 +#else + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, K, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 +move BO, B + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, 
c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE +addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + LD b3, CO2, 0 * SIZE + ADD c31, c31, c42 + LD b4, CO2, 1 * SIZE + ADD c32, c32, c41 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d CO2,CO2, 2 * SIZE + MADD b3, c31, ALPHA_R, b3 + addi.d I, I, -1 + MADD b4, c32, ALPHA_R, b4 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + ST b3, CO2, -2 * SIZE +#else + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + MUL b1, ALPHA_R, c11 + addi.d CO1,CO1, 2 * SIZE + MUL b2, ALPHA_R, c12 + addi.d CO2,CO2, 2 * SIZE + MUL b3, ALPHA_R, c31 + addi.d I, I, -1 + MUL b4, ALPHA_R, c32 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + ST b3, CO2, -2 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -2 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + ST b4, CO2, -1 * SIZE + blt $r0, I, .L21 + .align 3 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 2 +#endif + move B, BO + .align 3 + +.L30: + andi J, N, 1 + MTC c11, $r0 +move CO1, C + bge $r0, J, .L999 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + move I, M + add.d C, CO1, LDC +move AO, A + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c21, 
c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 1 +#endif + srai.d L, TEMP, 2 +MOV c42, c11 + bge $r0, L, .L35 +#else + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, K, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 +move BO, B + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d I, I, -1 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + blt $r0, I, .L31 +#else + ADD c11, c11, c22 + ADD c12, c12, c21 + MUL b1, ALPHA_R, c11 + addi.d CO1,CO1, 2 * SIZE + MUL b2, ALPHA_R, c12 + addi.d I, I, -1 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -1 +#endif + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + blt $r0, I, .L31 +#endif + .align 3 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 1 +#endif + move B, BO + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 64 + fld.d $f24, $sp, 16 + fld.d $f25, $sp, 24 + fld.d $f26, $sp, 32 + fld.d $f27, $sp, 40 + fld.d $f28, $sp, 48 + fld.d $f29, $sp, 56 +#if defined(TRMMKERNEL) + LDARG $r26, $sp, 72 + LDARG $r27, $sp, 80 +#endif +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + fmov.d $f1, $f23 + jirl $r0, $r1, 0x0 + + EPILOGUE diff 
--git a/kernel/loongarch64/zgemv_n.S b/kernel/loongarch64/zgemv_n.S new file mode 100644 index 000000000..d995ce86b --- /dev/null +++ b/kernel/loongarch64/zgemv_n.S @@ -0,0 +1,648 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r17 + +#define YORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 + +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define x1 $f14 +#define x2 $f15 +#define x3 $f16 +#define x4 $f17 +#define y1 $f3 +#define y2 $f4 +#define y3 $f2 +#define y4 $f5 +#define t1 $f6 +#define t2 $f7 +#define t3 $f18 +#define t4 $f19 +#define t5 $f20 +#define t6 $f21 +#define t7 $f24 +#define t8 $f25 + +#if !defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#if defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif +#if !defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif +#if defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifndef __64BIT__ + addi.d $sp, $sp, -64 +#else + addi.d $sp, $sp, -32 +#endif + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + fst.d $f24, $sp, 16 + fst.d $f25, $sp, 24 +#ifndef __64BIT__ + fst.d $f18, $sp, 32 + fst.d $f19, $sp, 40 + fst.d $f20, $sp, 48 + fst.d $f21, $sp, 56 +#endif + slli.d LDA, LDA, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, M, .L999 + 
slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + li.d I, 2 * SIZE + move YORIG, Y + beq INCY, I, .L10 + srai.d I, M, 2 + move YORIG, BUFFER + move XX, Y + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCY + LD a3, XX, 0 * SIZE + LD a4, XX, 1 * SIZE + add.d XX, XX, INCY + LD a5, XX, 0 * SIZE + LD a6, XX, 1 * SIZE + add.d XX, XX, INCY + LD a7, XX, 0 * SIZE + LD a8, XX, 1 * SIZE + add.d XX, XX, INCY + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + ST a1, YY, -8 * SIZE + ST a2, YY, -7 * SIZE + ST a3, YY, -6 * SIZE + ST a4, YY, -5 * SIZE + ST a5, YY, -4 * SIZE + ST a6, YY, -3 * SIZE + ST a7, YY, -2 * SIZE + ST a8, YY, -1 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCY + addi.d I, I, -1 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + addi.d YY, YY, 2 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + bge $r0, J, .L20 + .align 3 + +.L11: + LD x1, X, 0 * SIZE + LD x2, X, 1 * SIZE + add.d X, X, INCX + LD x3, X, 0 * SIZE + LD x4, X, 1 * SIZE + add.d X, X, INCX + MUL a1, ALPHA_R, x1 + move AO1, A + MUL a2, ALPHA_I, x1 + add.d AO2, A, LDA + MUL a3, ALPHA_R, x3 + add.d A, AO2, LDA + MUL a4, ALPHA_I, x3 +#ifndef XCONJ + NMSUB x1, x2, ALPHA_I, a1 + MADD x2, x2, ALPHA_R, a2 + NMSUB x3, x4, ALPHA_I, a3 + MADD x4, x4, ALPHA_R, a4 +#else + MADD x1, x2, ALPHA_I, a1 + MSUB x2, x2, ALPHA_R, a2 + MADD x3, x4, ALPHA_I, a3 + MSUB x4, x4, ALPHA_R, a4 +#endif + srai.d I, M, 2 + move YY, YORIG + bge $r0, I, .L15 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a2, AO1, 1 * SIZE + LD y4, YY, 3 * SIZE + LD a4, AO1, 3 * SIZE + LD a5, AO2, 0 * SIZE + LD a6, AO2, 1 * SIZE + LD a7, AO2, 2 * SIZE + LD a8, AO2, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 4 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 5 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 6 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 6 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 5 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 7 * SIZE + MADD4 t4, a4, x1, t4 + LD a4, AO1, 7 * SIZE + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + LD a5, AO2, 4 * SIZE + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + LD a7, AO2, 6 * SIZE + MADD3 t1, a6, x4, t1 + MADD4 t2, a6, x3, t2 + LD a6, AO2, 5 * SIZE + MADD3 t3, a8, x4, t3 + addi.d I, I, -1 + MADD4 t4, a8, x3, t4 + LD a8, AO2, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + MADD1 t5, a1, x1, y1 + LD y1, YY, 8 * SIZE + MADD2 t6, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 t7, a3, x1, y3 + LD y2, YY, 9 * SIZE + MADD2 t8, a3, x2, y4 + LD a3, AO1, 10 * SIZE + MADD3 t5, a2, x2, t5 + LD y3, YY, 10 * SIZE + MADD4 t6, a2, x1, t6 + LD a2, AO1, 9 * SIZE + MADD3 t7, a4, x2, t7 + LD y4, YY, 11 * SIZE + MADD4 t8, a4, x1, t8 + LD a4, AO1, 11 * SIZE + MADD1 t5, a5, x3, t5 + ST t1, YY, 0 * SIZE + MADD2 t6, a5, x4, t6 + LD a5, AO2, 8 * SIZE + MADD1 t7, a7, x3, t7 + ST t2, YY, 1 * SIZE + MADD2 t8, a7, x4, t8 + LD a7, AO2, 10 * SIZE + MADD3 t5, a6, x4, t5 + ST t3, YY, 2 * SIZE + MADD4 t6, a6, x3, t6 + LD a6, AO2, 9 * SIZE + MADD3 t7, a8, x4, t7 + ST t4, YY, 3 * SIZE + MADD4 t8, a8, x3, t8 + LD a8, AO2, 11 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 12 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 12 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 13 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 14 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 14 * SIZE + MADD4 t2, 
a2, x1, t2 + LD a2, AO1, 13 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 15 * SIZE + MADD4 t4, a4, x1, t4 + LD a4, AO1, 15 * SIZE + MADD1 t1, a5, x3, t1 + ST t5, YY, 4 * SIZE + MADD2 t2, a5, x4, t2 + LD a5, AO2, 12 * SIZE + MADD1 t3, a7, x3, t3 + ST t6, YY, 5 * SIZE + MADD2 t4, a7, x4, t4 + LD a7, AO2, 14 * SIZE + MADD3 t1, a6, x4, t1 + ST t7, YY, 6 * SIZE + MADD4 t2, a6, x3, t2 + LD a6, AO2, 13 * SIZE + MADD3 t3, a8, x4, t3 + ST t8, YY, 7 * SIZE + MADD4 t4, a8, x3, t4 + LD a8, AO2, 15 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d AO2, AO2, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST t1, YY, 0 * SIZE + MADD1 t1, a1, x1, y1 + ST t2, YY, 1 * SIZE + MADD2 t2, a1, x2, y2 + ST t3, YY, 2 * SIZE + MADD1 t3, a3, x1, y3 + ST t4, YY, 3 * SIZE + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + MADD3 t1, a6, x4, t1 + addi.d AO1, AO1, 8 * SIZE + MADD4 t2, a6, x3, t2 + addi.d AO2, AO2, 8 * SIZE + MADD3 t3, a8, x4, t3 + addi.d YY, YY, 8 * SIZE + MADD4 t4, a8, x3, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L15: + andi I, M, 2 + bge $r0, I, .L16 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD a5, AO2, 0 * SIZE + MADD2 t2, a1, x2, y2 + LD a6, AO2, 1 * SIZE + MADD1 t3, a3, x1, y3 + LD a7, AO2, 2 * SIZE + MADD2 t4, a3, x2, y4 + LD a8, AO2, 3 * SIZE + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + MADD3 t1, a6, x4, t1 + addi.d YY, YY, 4 * SIZE + MADD4 t2, a6, x3, t2 + addi.d AO1, AO1, 4 * SIZE + MADD3 t3, a8, x4, t3 + addi.d AO2, AO2, 4 * SIZE + MADD4 t4, a8, x3, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L16: + andi I, M, 1 + bge $r0, I, .L19 + LD y1, YY, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a2, AO1, 1 * SIZE + MADD1 t1, a1, x1, y1 + LD a5, AO2, 0 * SIZE + MADD2 t2, a1, x2, y2 + LD a6, AO2, 1 * SIZE + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD3 t1, a6, x4, t1 + MADD4 t2, a6, x3, t2 + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + .align 3 + +.L19: + addi.d J, J, -1 + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + bge $r0, J, .L900 + LD x1, X, 0 * SIZE + LD x2, X, 1 * SIZE + add.d X, X, INCX + MUL a1, ALPHA_R, x1 + move AO1, A + MUL a2, ALPHA_I, x1 +#ifndef XCONJ + NMSUB x1, x2, ALPHA_I, a1 + MADD x2, x2, ALPHA_R, a2 +#else + MADD x1, x2, ALPHA_I, a1 + MSUB x2, x2, ALPHA_R, a2 +#endif + srai.d I, M, 2 + move YY, YORIG + bge $r0, I, .L25 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a2, AO1, 1 * SIZE + LD y4, YY, 3 * SIZE + LD a4, AO1, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 4 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 5 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 6 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 6 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 5 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 7 * SIZE + MADD4 t4, a4, x1, t4 + addi.d I, I, -1 + LD a4, AO1, 7 * SIZE + bge $r0, I, .L23 
+ .align 3 +.L22: + MADD1 t5, a1, x1, y1 + LD y1, YY, 8 * SIZE + MADD2 t6, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 t7, a3, x1, y3 + LD y2, YY, 9 * SIZE + MADD2 t8, a3, x2, y4 + LD a3, AO1, 10 * SIZE + MADD3 t5, a2, x2, t5 + LD y3, YY, 10 * SIZE + MADD4 t6, a2, x1, t6 + LD a2, AO1, 9 * SIZE + MADD3 t7, a4, x2, t7 + LD y4, YY, 11 * SIZE + MADD4 t8, a4, x1, t8 + LD a4, AO1, 11 * SIZE + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + ST t3, YY, 2 * SIZE + ST t4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 12 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 12 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 13 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 14 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 14 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 13 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 15 * SIZE + MADD4 t4, a4, x1, t4 + LD a4, AO1, 15 * SIZE + ST t5, YY, 4 * SIZE + ST t6, YY, 5 * SIZE + ST t7, YY, 6 * SIZE + ST t8, YY, 7 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + ST t1, YY, 0 * SIZE + MADD1 t1, a1, x1, y1 + ST t2, YY, 1 * SIZE + MADD2 t2, a1, x2, y2 + ST t3, YY, 2 * SIZE + MADD1 t3, a3, x1, y3 + ST t4, YY, 3 * SIZE + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + addi.d AO1, AO1, 8 * SIZE + MADD4 t2, a2, x1, t2 + addi.d YY, YY, 8 * SIZE + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L25: + andi I, M, 2 + bge $r0, I, .L26 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + MADD2 t2, a1, x2, y2 + MADD1 t3, a3, x1, y3 + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + addi.d YY, YY, 4 * SIZE + MADD4 t2, a2, x1, t2 + addi.d AO1, AO1, 4 * SIZE + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L26: + andi I, M, 1 + bge $r0, I, .L900 + LD y1, YY, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a2, AO1, 1 * SIZE + MADD1 t1, a1, x1, y1 + MADD2 t2, a1, x2, y2 + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + .align 3 + +.L900: + li.d YORIG, 2 * SIZE + srai.d I, M, 2 + beq INCY, YORIG, .L999 + move XX, BUFFER + bge $r0, I, .L905 + .align 3 + +.L902: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + LD a3, XX, 2 * SIZE + LD a4, XX, 3 * SIZE + LD a5, XX, 4 * SIZE + LD a6, XX, 5 * SIZE + LD a7, XX, 6 * SIZE + LD a8, XX, 7 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + ST a5, Y, 0 * SIZE + ST a6, Y, 1 * SIZE + add.d Y, Y, INCY + ST a7, Y, 0 * SIZE + ST a8, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d XX, XX, 8 * SIZE + blt $r0, I, .L902 + .align 3 + +.L905: + andi I, M, 3 + bge $r0, I, .L999 + .align 3 + +.L906: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + addi.d XX, XX, 2 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L906 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + fld.d $f24, $sp, 16 + fld.d $f25, $sp, 24 +#ifndef __64BIT__ + fld.d $f18, $sp, 32 + fld.d $f19, $sp, 40 + fld.d $f20, $sp, 48 + fld.d $f21, $sp, 56 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 32 +#else + addi.d $sp, $sp, 64 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE 
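Note on the complex arithmetic used throughout these kernels: zgemv_n above (and zgemv_t below) bind MADD1..MADD4 to either MADD or NMSUB depending on the CONJ/XCONJ preprocessor flags, so a single fused multiply-add sequence covers the four variants y += a*x, conj(a)*x, a*conj(x) and conj(a)*conj(x). The following is a minimal C sketch of the sign pattern those bindings realize, assuming double precision; the helper name cmadd and the small driver are hypothetical, for illustration only, and are not part of this patch.

#include <stdio.h>

/* Hypothetical helper (not in the patch): accumulate y += op(a) * op(x)
 * with real and imaginary parts kept separate, mirroring how MADD1..MADD4
 * are bound to MADD (fused multiply-add) or NMSUB (fused multiply-subtract)
 * in the zgemv kernels. */
static void cmadd(double ar, double ai, double xr, double xi,
                  double *yr, double *yi, int conj_a, int conj_x)
{
    if (conj_a) ai = -ai;          /* CONJ:  operate on conj(a) */
    if (conj_x) xi = -xi;          /* XCONJ: operate on conj(x) */
    *yr += ar * xr - ai * xi;      /* MADD1 followed by MADD3   */
    *yi += ar * xi + ai * xr;      /* MADD2 followed by MADD4   */
}

int main(void)
{
    double yr = 0.0, yi = 0.0;
    cmadd(1.0, 2.0, 3.0, 4.0, &yr, &yi, 0, 0);  /* (1+2i)*(3+4i) = -5+10i */
    printf("%g %g\n", yr, yi);
    return 0;
}

The gemm/trmm kernel earlier in this hunk follows the same idea but defers the combination: it keeps the partial products in separate accumulators (c11/c22, c12/c21, ...) and only merges them with ADD immediately before scaling by ALPHA_R/ALPHA_I in the store path.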
diff --git a/kernel/loongarch64/zgemv_t.S b/kernel/loongarch64/zgemv_t.S new file mode 100644 index 000000000..841823e1c --- /dev/null +++ b/kernel/loongarch64/zgemv_t.S @@ -0,0 +1,556 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r17 + +#define XORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 + +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define y1 $f14 +#define y2 $f15 +#define y3 $f16 +#define y4 $f17 +#define x1 $f3 +#define x2 $f4 +#define x3 $f2 +#define x4 $f5 +#define x5 $f6 +#define x6 $f7 +#define x7 $f18 +#define x8 $f19 + +#if !defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#if defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif +#if !defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif +#if defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifdef __64BIT__ + addi.d $sp, $sp, -16 +#else + addi.d $sp, $sp, -32 +#endif + MTC y1, $r0 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + slli.d LDA, LDA, ZBASE_SHIFT +#ifndef __64BIT__ + fst.d $f18, $sp, 16 + fst.d $f19, $sp, 24 +#endif + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + li.d I, 2 * SIZE + move XORIG, X + beq INCX, I, .L10 + srai.d I, M, 2 + move XORIG, 
BUFFER + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + ST a1, YY, -8 * SIZE + ST a2, YY, -7 * SIZE + ST a3, YY, -6 * SIZE + ST a4, YY, -5 * SIZE + ST a5, YY, -4 * SIZE + ST a6, YY, -3 * SIZE + ST a7, YY, -2 * SIZE + ST a8, YY, -1 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 2 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + move YY, Y + bge $r0, J, .L20 + .align 3 + +.L11: + move AO1, A + MOV y2, y1 + add.d AO2, A, LDA + MOV y3, y1 + add.d A, AO2, LDA + MOV y4, y1 + srai.d I, M, 2 + move XX, XORIG + bge $r0, I, .L15 + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD x4, XX, 3 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD a4, AO2, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD a7, AO2, 2 * SIZE + LD a6, AO1, 3 * SIZE + LD a8, AO2, 3 * SIZE + addi.d I, I, -1 + bge $r0, I, .L13 + .align 3 +.L12: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 4 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + LD a7, AO2, 6 * SIZE + MADD3 y1, a6, x4, y1 + addi.d I, I, -1 + MADD4 y2, a6, x3, y2 + LD a6, AO1, 7 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 8 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 9 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 9 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 9 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 8 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 10 * SIZE + MADD1 y3, a7, x3, y3 + addi.d XX, XX, 8 * SIZE + MADD2 y4, a7, x4, y4 + LD a7, AO2, 10 * SIZE + MADD3 y1, a6, x4, y1 + addi.d AO2, AO2, 8 * SIZE + MADD4 y2, a6, x3, y2 + LD a6, AO1, 11 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 3 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 3 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 4 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + LD a7, AO2, 6 * SIZE + MADD3 y1, a6, x4, y1 + MADD4 y2, a6, x3, y2 + LD a6, AO1, 7 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, 
x1, y2 + MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + MADD3 y1, a6, x4, y1 + addi.d XX, XX, 8 * SIZE + MADD4 y2, a6, x3, y2 + addi.d AO1, AO1, 8 * SIZE + MADD3 y3, a8, x4, y3 + addi.d AO2, AO2, 8 * SIZE + MADD4 y4, a8, x3, y4 + .align 3 + +.L15: + andi I, M, 2 + bge $r0, I, .L17 + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD x3, XX, 2 * SIZE + LD x4, XX, 3 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD a4, AO2, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD a7, AO2, 2 * SIZE + LD a6, AO1, 3 * SIZE + LD a8, AO2, 3 * SIZE + MADD1 y1, a1, x1, y1 + MADD2 y2, a1, x2, y2 + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + MADD3 y1, a6, x4, y1 + addi.d XX, XX, 4 * SIZE + MADD4 y2, a6, x3, y2 + addi.d AO1, AO1, 4 * SIZE + MADD3 y3, a8, x4, y3 + addi.d AO2, AO2, 4 * SIZE + MADD4 y4, a8, x3, y4 + .align 3 + +.L17: + andi I, M, 1 +.align 3 + + bge $r0, I, .L19 +.L18: + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + MADD1 y1, a1, x1, y1 + LD a2, AO1, 1 * SIZE + MADD2 y2, a1, x2, y2 + LD a4, AO2, 1 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + .align 3 + +.L19: + LD a1, Y, 0 * SIZE + LD a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + add.d Y, Y, INCY + MADD a1, y1, ALPHA_R, a1 + MADD a2, y1, ALPHA_I, a2 + MADD a3, y3, ALPHA_R, a3 + MADD a4, y3, ALPHA_I, a4 + NMSUB a1, y2, ALPHA_I, a1 + MADD a2, y2, ALPHA_R, a2 + NMSUB a3, y4, ALPHA_I, a3 + MTC y1, $r0 + MADD a4, y4, ALPHA_R, a4 + addi.d J, J, -1 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + add.d YY, YY, INCY + ST a3, YY, 0 * SIZE + ST a4, YY, 1 * SIZE + add.d YY, YY, INCY + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + MOV y2, y1 + srai.d I, M, 2 + bge $r0, J, .L999 + MOV y3, y1 + move AO1, A + MOV y4, y1 + move XX, XORIG + bge $r0, I, .L25 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x4, XX, 3 * SIZE + addi.d I, I, -1 + LD a6, AO1, 3 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a6, x3, y4 + LD a6, AO1, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 9 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 9 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 8 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 10 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 11 * SIZE + MADD4 y4, a6, x3, y4 + LD a6, AO1, 11 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 
7 * SIZE + MADD4 y4, a6, x3, y4 + LD a6, AO1, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + MADD3 y3, a2, x2, y3 + MADD4 y4, a2, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD3 y3, a6, x4, y3 + addi.d XX, XX, 8 * SIZE + MADD4 y4, a6, x3, y4 + addi.d AO1, AO1, 8 * SIZE + .align 3 + +.L25: + andi I, M, 2 + bge $r0, I, .L27 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a6, AO1, 3 * SIZE + MADD3 y3, a2, x2, y3 + LD x4, XX, 3 * SIZE + MADD4 y4, a2, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD3 y3, a6, x4, y3 + addi.d XX, XX, 4 * SIZE + MADD4 y4, a6, x3, y4 + addi.d AO1, AO1, 4 * SIZE + .align 3 + +.L27: + andi I, M, 1 +.align 3 + + bge $r0, I, .L29 +.L28: + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + MADD1 y1, a1, x1, y1 + MADD2 y2, a1, x2, y2 + MADD3 y3, a2, x2, y3 + MADD4 y4, a2, x1, y4 + .align 3 + +.L29: + LD a1, Y, 0 * SIZE + LD a2, Y, 1 * SIZE + ADD y1, y1, y3 + ADD y2, y2, y4 + MADD a1, y1, ALPHA_R, a1 + MADD a2, y1, ALPHA_I, a2 + NMSUB a1, y2, ALPHA_I, a1 + MADD a2, y2, ALPHA_R, a2 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 +#ifndef __64BIT__ + fld.d $f18, $sp, 16 + fld.d $f19, $sp, 24 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 16 +#else + addi.d $sp, $sp, 32 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/znrm2.S b/kernel/loongarch64/znrm2.S new file mode 100644 index 000000000..49f640268 --- /dev/null +++ b/kernel/loongarch64/znrm2.S @@ -0,0 +1,304 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r7 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define ALPHA $f4 +#define max $f5 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + move XX, X + MOV s2, s1 + srai.d I, N, 2 + MOV s3, s1 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + NOP + FABS t3, a3 + LD a2, X, 1 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + NOP + CMPLT $fcc2, s3, t3 + LD a4, X, 1 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + NOP + FABS t3, a7 + LD a6, X, 1 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + NOP + CMPLT $fcc2, s3, t3 + LD a8, X, 1 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L100 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + FABS t1, a1 + FABS t2, a2 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L100: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + lu12i.w TEMP, 0x3f800 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, s1, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, s1 + MOV max, s1 + MOV s1, a1 + MOV s2, a1 + MOV s3, a1 + MOV s4, a1 + srai.d I, N, 2 + bge $r0, I, .L105 + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, XX, 0 * SIZE + LD a4, XX, 1 * SIZE + add.d XX, XX, INCX + LD a5, XX, 0 * SIZE + LD a6, XX, 1 * SIZE + add.d XX, XX, INCX + LD a7, XX, 0 * SIZE + LD a8, XX, 1 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + bge $r0, I, .L104 + .align 3 + +.L103: + MUL t1, ALPHA, a1 + LD a1, XX, 0 * SIZE + MUL t2, ALPHA, a2 + addi.d I, I, -1 + MUL t3, ALPHA, a3 + 
LD a2, XX, 1 * SIZE + MUL t4, ALPHA, a4 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a3, XX, 0 * SIZE + MADD s2, t2, t2, s2 + NOP + MADD s3, t3, t3, s3 + LD a4, XX, 1 * SIZE + MADD s4, t4, t4, s4 + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, XX, 0 * SIZE + MUL t2, ALPHA, a6 + NOP + MUL t3, ALPHA, a7 + LD a6, XX, 1 * SIZE + MUL t4, ALPHA, a8 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a7, XX, 0 * SIZE + MADD s2, t2, t2, s2 + LD a8, XX, 1 * SIZE + MADD s3, t3, t3, s3 + add.d XX, XX, INCX + MADD s4, t4, t4, s4 + blt $r0, I, .L103 + .align 3 + +.L104: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + .align 3 + +.L105: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L106: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + addi.d I, I, -1 + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MADD s1, t1, t1, s1 + add.d XX, XX, INCX + MADD s2, t2, t2, s2 + blt $r0, I, .L106 + .align 3 + +.L998: + ADD s1, s1, s2 + ADD s3, s3, s4 + ADD s1, s1, s3 + fsqrt.d s1, s1 + move $r4, $r17 + MUL $f0, max, s1 + jirl $r0, $r1, 0x0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zscal.S b/kernel/loongarch64/zscal.S new file mode 100644 index 000000000..a12e527a5 --- /dev/null +++ b/kernel/loongarch64/zscal.S @@ -0,0 +1,356 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define t1 $f14 +#define t2 $f15 +#define t3 $f16 +#define t4 $f17 + + PROLOGUE + + li.d TEMP, 2 * SIZE + MTC a1, $r0 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, N, .L999 + CMPEQ $fcc0, ALPHA_R, a1 + CMPEQ $fcc1, ALPHA_I, a1 + bceqz $fcc0, .L50 + bceqz $fcc1, .L50 + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + .align 3 + +.L12: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + ST a1, X, 2 * SIZE + ST a1, X, 3 * SIZE + ST a1, X, 4 * SIZE + ST a1, X, 5 * SIZE + ST a1, X, 6 * SIZE + ST a1, X, 7 * SIZE + addi.w I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L16: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L16 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: + srai.d I, N, 2 + bge $r0, I, .L25 + .align 3 + +.L22: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L26: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + ST a1, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L26 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L50: + srai.d I, N, 2 + bne INCX, TEMP, .L60 + addi.d I, I, -1 + blt I, $r0, .L55 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + MUL t1, ALPHA_R, a1 + LD a7, X, 6 * SIZE + MUL t2, ALPHA_I, a1 + LD a8, X, 7 * SIZE + MUL t3, ALPHA_R, a3 + MUL t4, ALPHA_I, a3 + bge $r0, I, .L53 + .align 3 + +.L52: + NMSUB t1, a2, ALPHA_I, t1 + LD a1, X, 8 * SIZE + MADD t2, a2, ALPHA_R, t2 + LD a2, X, 9 * SIZE + NMSUB t3, a4, ALPHA_I, t3 + LD a3, X, 10 * SIZE + MADD t4, a4, ALPHA_R, t4 + LD a4, X, 11 * SIZE + ST t1, X, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA_I, a5 + ST t3, X, 2 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA_I, a7 + NMSUB t1, a6, ALPHA_I, t1 + LD a5, X, 12 * SIZE + MADD t2, a6, ALPHA_R, t2 + LD a6, X, 13 * SIZE + NMSUB t3, a8, ALPHA_I, t3 + LD a7, X, 14 * SIZE + MADD t4, a8, ALPHA_R, t4 + LD a8, X, 15 * SIZE + ST t1, X, 4 * SIZE + MUL t1, ALPHA_R, a1 + ST t2, X, 5 * SIZE + MUL t2, ALPHA_I, a1 + ST t3, X, 6 * SIZE + MUL t3, ALPHA_R, a3 + ST t4, X, 7 * SIZE + MUL t4, ALPHA_I, a3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L52 + .align 3 + +.L53: + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + NMSUB t3, a4, ALPHA_I, t3 + MADD t4, a4, ALPHA_R, t4 + ST t1, X, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA_I, a5 + ST t3, X, 2 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA_I, a7 + NMSUB t1, a6, ALPHA_I, t1 + MADD t2, a6, ALPHA_R, t2 + NMSUB t3, a8, ALPHA_I, t3 + MADD t4, a8, ALPHA_R, t4 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + .align 3 + +.L55: + andi I, N, 3 + bge $r0, I, 
.L999 + .align 3 +.L56: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + MUL t1, ALPHA_R, a1 + MUL t2, ALPHA_I, a1 + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + addi.d X, X, 2 * SIZE + addi.d I, I, -1 + ST t1, X, -2 * SIZE + ST t2, X, -1 * SIZE + blt $r0, I, .L56 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L60: + srai.d I, N, 2 + move XX, X + addi.d I, I, -1 + blt I, $r0, .L65 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + MUL t1, ALPHA_R, a1 + LD a7, X, 0 * SIZE + MUL t2, ALPHA_I, a1 + LD a8, X, 1 * SIZE + MUL t3, ALPHA_R, a3 + add.d X, X, INCX + MUL t4, ALPHA_I, a3 + bge $r0, I, .L63 + .align 3 + +.L62: + NMSUB t1, a2, ALPHA_I, t1 + LD a1, X, 0 * SIZE + MADD t2, a2, ALPHA_R, t2 + LD a2, X, 1 * SIZE + add.d X, X, INCX + NMSUB t3, a4, ALPHA_I, t3 + LD a3, X, 0 * SIZE + MADD t4, a4, ALPHA_R, t4 + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a5 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a7 + add.d XX, XX, INCX + NMSUB t1, a6, ALPHA_I, t1 + LD a5, X, 0 * SIZE + MADD t2, a6, ALPHA_R, t2 + LD a6, X, 1 * SIZE + add.d X, X, INCX + NMSUB t3, a8, ALPHA_I, t3 + LD a7, X, 0 * SIZE + MADD t4, a8, ALPHA_R, t4 + LD a8, X, 1 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a1 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a1 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a3 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a3 + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L62 + .align 3 + +.L63: + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + NMSUB t3, a4, ALPHA_I, t3 + MADD t4, a4, ALPHA_R, t4 + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a5 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a7 + add.d XX, XX, INCX + NMSUB t1, a6, ALPHA_I, t1 + MADD t2, a6, ALPHA_R, t2 + NMSUB t3, a8, ALPHA_I, t3 + MADD t4, a8, ALPHA_R, t4 + ST t1, XX, 0 * SIZE + ST t2, XX, 1 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + ST t4, XX, 1 * SIZE + add.d XX, XX, INCX + .align 3 + +.L65: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L66: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + MUL t1, ALPHA_R, a1 + MUL t2, ALPHA_I, a1 + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + addi.d I, I, -1 + ST t1, X, 0 * SIZE + ST t2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L66 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/ztrsm_kernel_LT.S b/kernel/loongarch64/ztrsm_kernel_LT.S new file mode 100644 index 000000000..26b1230b8 --- /dev/null +++ b/kernel/loongarch64/ztrsm_kernel_LT.S @@ -0,0 +1,1344 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. 
+3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define KK $r26 +#define TEMP $r27 +#define AORIG $r28 +#define a1 $f22 +#define a2 $f8 +#define a3 $f26 +#define a4 $f27 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f0 +#define c22 $f1 +#define c31 $f2 +#define c32 $f3 +#define c41 $f4 +#define c42 $f5 +#define c51 $f6 +#define c52 $f7 +#define c61 $f18 +#define c62 $f19 +#define c71 $f20 +#define c72 $f21 +#define c81 $f24 +#define c82 $f25 + +#ifndef CONJ +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#define MADD5 MSUB +#define MADD6 MADD +#define MADD7 NMSUB +#define MADD8 MADD +#else +#if defined(LN) || defined(LT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#else +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#define MADD5 MADD +#define MADD6 MSUB +#define MADD7 MADD +#define MADD8 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, ZBASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + srai.d J, N, 2 +nop + bge $r0, J, .L20 +.L10: +#ifdef RT + slli.d TEMP, K, 2 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + move I, M +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, 
OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c61, c11 + bge $r0, I, .L19 + .align 3 + +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + srai.d L, TEMP, 2 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + 
MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + 
LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + ADD c51, c51, c62 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 + MUL a1, b2, c52 + MUL a2, b2, c51 + MUL a3, b2, c72 + MUL a4, b2, c71 + MADD5 c51, c51, b1, a1 + MADD6 c52, c52, b1, a2 + MADD5 c71, c71, b1, a3 + MADD6 c72, c72, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + NMSUB c51, c11, b5, c51 + MADD7 c52, c11, b6, c52 + NMSUB c71, c11, b7, c71 + MADD7 c72, c11, b8, c72 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + MADD8 c51, c12, b6, c51 + NMSUB c52, c12, b5, c52 + MADD8 c71, c12, b8, c71 + NMSUB c72, c12, b7, c72 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 + NMSUB c51, c31, b5, c51 + MADD7 c52, c31, b6, c52 + NMSUB c71, c31, b7, c71 + MADD7 c72, c31, b8, c72 + MADD8 c51, c32, b6, c51 + NMSUB c52, c32, b5, c52 + MADD8 c71, c32, b8, c71 + NMSUB c72, c32, b7, c72 + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL a1, b6, c52 + MUL a2, b6, c51 + MADD5 c51, c51, b5, a1 + MADD6 c52, c52, b5, a2 + NMSUB c71, c51, b7, c71 + MADD7 c72, c51, b8, c72 + MADD8 c71, c52, b8, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL a1, b8, c72 + MUL a2, b8, c71 + MADD5 c71, c71, b7, a1 + MADD6 c72, c72, b7, a2 +#endif +#ifdef RT + LD b1, BO, 30 * SIZE + LD b2, BO, 31 * SIZE + LD b3, BO, 28 * SIZE + LD b4, BO, 29 * SIZE + LD b5, BO, 26 * SIZE + LD b6, BO, 27 * SIZE + LD b7, BO, 24 * SIZE + LD b8, BO, 25 * SIZE + MUL a1, b2, c72 + MUL a2, b2, c71 + MADD5 c71, c71, b1, a1 + MADD6 c72, c72, b1, a2 + NMSUB c51, c71, b3, c51 + MADD7 c52, c71, b4, c52 + NMSUB c31, c71, b5, c31 + MADD7 c32, c71, b6, c32 + NMSUB c11, c71, b7, c11 + MADD7 
c12, c71, b8, c12 + MADD8 c51, c72, b4, c51 + NMSUB c52, c72, b3, c52 + MADD8 c31, c72, b6, c31 + NMSUB c32, c72, b5, c32 + MADD8 c11, c72, b8, c11 + NMSUB c12, c72, b7, c12 + LD b3, BO, 20 * SIZE + LD b4, BO, 21 * SIZE + LD b5, BO, 18 * SIZE + LD b6, BO, 19 * SIZE + LD b7, BO, 16 * SIZE + LD b8, BO, 17 * SIZE + MUL a1, b4, c52 + MUL a2, b4, c51 + MADD5 c51, c51, b3, a1 + MADD6 c52, c52, b3, a2 + NMSUB c31, c51, b5, c31 + MADD7 c32, c51, b6, c32 + NMSUB c11, c51, b7, c11 + MADD7 c12, c51, b8, c12 + MADD8 c31, c52, b6, c31 + NMSUB c32, c52, b5, c32 + MADD8 c11, c52, b8, c11 + NMSUB c12, c52, b7, c12 + LD b5, BO, 10 * SIZE + LD b6, BO, 11 * SIZE + LD b7, BO, 8 * SIZE + LD b8, BO, 9 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c52, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c72, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c52, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c72, AO, 7 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE + addi.d CO3,CO3, -2 * SIZE + addi.d CO4,CO4, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE + ST c51, CO3, 0 * SIZE + ST c52, CO3, 1 * SIZE + ST c71, CO4, 0 * SIZE + ST c72, CO4, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE + addi.d CO3,CO3, 2 * SIZE + addi.d CO4,CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif +MTC c11, $r0 + addi.d I, I, -1 + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#ifdef LN + slli.d TEMP, K, 2 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + blt $r0, J, .L10 + .align 3 + +.L20: + andi J, N, 2 + bge $r0, J, .L30 +#ifdef RT + slli.d TEMP, K, 1 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif +MTC c11, $r0 + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + move I, M + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, KK, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, 
TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + srai.d L, TEMP, 2 + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 +MOV c42, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE +addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, 
c32 + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 +#endif +#ifdef RT + LD b5, BO, 6 * SIZE + LD b6, BO, 7 * SIZE + LD b7, BO, 4 * SIZE + LD b8, BO, 5 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 1 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L30: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif +MTC c11, $r0 + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + move I, M + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, KK, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + srai.d L, TEMP, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE +MOV c42, c11 + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE + 
MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: + ADD c11, c11, c22 + ADD c12, c12, c21 +#if defined(LN) || defined(RT) + addi.d TEMP, KK, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L31 + .align 3 + +.L39: +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/ztrsm_kernel_RT.S b/kernel/loongarch64/ztrsm_kernel_RT.S new file mode 100644 index 000000000..e9f04362d --- /dev/null +++ b/kernel/loongarch64/ztrsm_kernel_RT.S @@ -0,0 +1,1343 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define KK $r26 +#define TEMP $r27 +#define AORIG $r28 +#define a1 $f22 +#define a2 $f8 +#define a3 $f26 +#define a4 $f27 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f0 +#define c22 $f1 +#define c31 $f2 +#define c32 $f3 +#define c41 $f4 +#define c42 $f5 +#define c51 $f6 +#define c52 $f7 +#define c61 $f18 +#define c62 $f19 +#define c71 $f20 +#define c72 $f21 +#define c81 $f24 +#define c82 $f25 + +#ifndef CONJ +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#define MADD5 MSUB +#define MADD6 MADD +#define MADD7 NMSUB +#define MADD8 MADD +#else +#if defined(LN) || defined(LT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#else +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#define MADD5 MADD +#define MADD6 MSUB +#define MADD7 MADD +#define MADD8 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, ZBASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + andi J, N, 1 + bge $r0, J, .L20 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif +MTC c11, $r0 + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + move I, M + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(LT) || 
defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, KK, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + srai.d L, TEMP, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE +MOV c42, c11 + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: + ADD c11, c11, c22 + ADD c12, c12, c21 +#if defined(LN) || defined(RT) + addi.d TEMP, KK, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L31 + .align 3 + +.L39: +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, 
KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L20: + andi J, N, 2 + bge $r0, J, .L30 +#ifdef RT + slli.d TEMP, K, 1 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif +MTC c11, $r0 + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + move I, M + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, KK, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + srai.d L, TEMP, 2 + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 +MOV c42, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE +addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + 
slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 +#endif +#ifdef RT + LD b5, BO, 6 * SIZE + LD b6, BO, 7 * SIZE + LD b7, BO, 4 * SIZE + LD b8, BO, 5 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 1 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L30: + srai.d J, N, 2 +nop + bge $r0, J, .L999 +.L10: +#ifdef RT + slli.d TEMP, K, 2 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + move I, M +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c61, c11 + bge $r0, I, .L19 + .align 3 + +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV 
c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + srai.d L, TEMP, 2 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 
c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + ADD c51, c51, c62 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d 
L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 + MUL a1, b2, c52 + MUL a2, b2, c51 + MUL a3, b2, c72 + MUL a4, b2, c71 + MADD5 c51, c51, b1, a1 + MADD6 c52, c52, b1, a2 + MADD5 c71, c71, b1, a3 + MADD6 c72, c72, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + NMSUB c51, c11, b5, c51 + MADD7 c52, c11, b6, c52 + NMSUB c71, c11, b7, c71 + MADD7 c72, c11, b8, c72 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + MADD8 c51, c12, b6, c51 + NMSUB c52, c12, b5, c52 + MADD8 c71, c12, b8, c71 + NMSUB c72, c12, b7, c72 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 + NMSUB c51, c31, b5, c51 + MADD7 c52, c31, b6, c52 + NMSUB c71, c31, b7, c71 + MADD7 c72, c31, b8, c72 + MADD8 c51, c32, b6, c51 + NMSUB c52, c32, b5, c52 + MADD8 c71, c32, b8, c71 + NMSUB c72, c32, b7, c72 + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL a1, b6, c52 + MUL a2, b6, c51 + MADD5 c51, c51, b5, a1 + MADD6 c52, c52, b5, a2 + NMSUB c71, c51, b7, c71 + MADD7 c72, c51, b8, c72 + MADD8 c71, c52, b8, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL a1, b8, c72 + MUL a2, b8, c71 + MADD5 c71, c71, b7, a1 + MADD6 c72, c72, b7, a2 +#endif +#ifdef RT + LD b1, BO, 30 * SIZE + LD b2, BO, 31 * SIZE + LD b3, BO, 28 * SIZE + LD b4, BO, 29 * SIZE + LD b5, BO, 26 * SIZE + LD b6, BO, 27 * SIZE + LD b7, BO, 24 * SIZE + LD b8, BO, 25 * SIZE + MUL a1, b2, c72 + MUL a2, b2, c71 + MADD5 c71, c71, b1, a1 + MADD6 c72, c72, b1, a2 + NMSUB c51, c71, b3, c51 + MADD7 c52, c71, b4, c52 + NMSUB c31, c71, b5, c31 + MADD7 c32, c71, b6, c32 + NMSUB c11, c71, b7, c11 + MADD7 c12, c71, b8, c12 + MADD8 c51, c72, b4, c51 + NMSUB c52, c72, b3, c52 + MADD8 c31, c72, b6, c31 + NMSUB c32, c72, b5, c32 + MADD8 c11, c72, b8, c11 + NMSUB c12, c72, b7, c12 + LD b3, BO, 20 * SIZE + LD b4, BO, 21 * SIZE + LD b5, BO, 18 * SIZE + LD b6, BO, 19 * SIZE + LD b7, BO, 16 * SIZE + LD b8, BO, 17 * SIZE + MUL a1, b4, c52 + MUL a2, b4, c51 + MADD5 c51, c51, b3, a1 + MADD6 c52, c52, b3, a2 + NMSUB c31, c51, b5, c31 + MADD7 c32, c51, b6, c32 + NMSUB 
c11, c51, b7, c11 + MADD7 c12, c51, b8, c12 + MADD8 c31, c52, b6, c31 + NMSUB c32, c52, b5, c32 + MADD8 c11, c52, b8, c11 + NMSUB c12, c52, b7, c12 + LD b5, BO, 10 * SIZE + LD b6, BO, 11 * SIZE + LD b7, BO, 8 * SIZE + LD b8, BO, 9 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c52, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c72, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c52, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c72, AO, 7 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE + addi.d CO3,CO3, -2 * SIZE + addi.d CO4,CO4, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE + ST c51, CO3, 0 * SIZE + ST c52, CO3, 1 * SIZE + ST c71, CO4, 0 * SIZE + ST c72, CO4, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE + addi.d CO3,CO3, 2 * SIZE + addi.d CO4,CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif +MTC c11, $r0 + addi.d I, I, -1 + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#ifdef LN + slli.d TEMP, K, 2 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + blt $r0, J, .L10 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/mips/KERNEL.generic b/kernel/mips/KERNEL.generic new file mode 100644 index 000000000..17f2ef976 --- /dev/null +++ b/kernel/mips/KERNEL.generic @@ -0,0 +1,160 @@ +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c 
+CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Pure C for other kernels +SAMAXKERNEL = ../mips/amax.c +DAMAXKERNEL = ../mips/amax.c +CAMAXKERNEL = ../mips/zamax.c +ZAMAXKERNEL = ../mips/zamax.c + +SAMINKERNEL = ../mips/amin.c +DAMINKERNEL = ../mips/amin.c +CAMINKERNEL = ../mips/zamin.c +ZAMINKERNEL = ../mips/zamin.c + +SMAXKERNEL = ../mips/max.c +DMAXKERNEL = ../mips/max.c + +SMINKERNEL = ../mips/min.c +DMINKERNEL = ../mips/min.c + +ISAMAXKERNEL = ../mips/iamax.c +IDAMAXKERNEL = ../mips/iamax.c +ICAMAXKERNEL = ../mips/izamax.c +IZAMAXKERNEL = ../mips/izamax.c + +ISAMINKERNEL = ../mips/iamin.c +IDAMINKERNEL = ../mips/iamin.c +ICAMINKERNEL = ../mips/izamin.c +IZAMINKERNEL = ../mips/izamin.c + +ISMAXKERNEL = ../mips/imax.c +IDMAXKERNEL = ../mips/imax.c + +ISMINKERNEL = ../mips/imin.c +IDMINKERNEL = ../mips/imin.c + +SASUMKERNEL = ../mips/asum.c +DASUMKERNEL = ../mips/asum.c +CASUMKERNEL = ../mips/zasum.c +ZASUMKERNEL = ../mips/zasum.c + +SSUMKERNEL = ../mips/sum.c +DSUMKERNEL = ../mips/sum.c +CSUMKERNEL = ../mips/zsum.c +ZSUMKERNEL = ../mips/zsum.c + +SAXPYKERNEL = ../mips/axpy.c +DAXPYKERNEL = ../mips/axpy.c +CAXPYKERNEL = ../mips/zaxpy.c +ZAXPYKERNEL = ../mips/zaxpy.c + +SCOPYKERNEL = ../mips/copy.c +DCOPYKERNEL = ../mips/copy.c +CCOPYKERNEL = ../mips/zcopy.c +ZCOPYKERNEL = ../mips/zcopy.c + +SDOTKERNEL = ../mips/dot.c +DDOTKERNEL = ../mips/dot.c +CDOTKERNEL = ../mips/zdot.c +ZDOTKERNEL = ../mips/zdot.c + +SNRM2KERNEL = ../mips/nrm2.c +DNRM2KERNEL = ../mips/nrm2.c +CNRM2KERNEL = ../mips/znrm2.c +ZNRM2KERNEL = ../mips/znrm2.c + +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c + +SSCALKERNEL = ../mips/scal.c +DSCALKERNEL = ../mips/scal.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c + +SSWAPKERNEL = ../mips/swap.c +DSWAPKERNEL = ../mips/swap.c +CSWAPKERNEL = ../mips/zswap.c +ZSWAPKERNEL = ../mips/zswap.c + +SGEMVNKERNEL = ../mips/gemv_n.c +DGEMVNKERNEL = ../mips/gemv_n.c +CGEMVNKERNEL = ../mips/zgemv_n.c +ZGEMVNKERNEL = ../mips/zgemv_n.c + +SGEMVTKERNEL = ../mips/gemv_t.c +DGEMVTKERNEL = ../mips/gemv_t.c +CGEMVTKERNEL = ../mips/zgemv_t.c +ZGEMVTKERNEL = ../mips/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL 
= ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/mips/cgemm_kernel_8x4_msa.c b/kernel/mips/cgemm_kernel_8x4_msa.c index 8b624be88..aa3f1dcfa 100644 --- a/kernel/mips/cgemm_kernel_8x4_msa.c +++ b/kernel/mips/cgemm_kernel_8x4_msa.c @@ -121,7 +121,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ - src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ @@ -200,7 +200,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP2_INC(pa0, 4, src_a0, src_a1); \ - src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ diff --git a/kernel/mips/cgemv_n_msa.c b/kernel/mips/cgemv_n_msa.c index 12fa7ca02..c1eb9bbfd 100644 --- a/kernel/mips/cgemv_n_msa.c +++ b/kernel/mips/cgemv_n_msa.c @@ -56,11 +56,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(XCONJ) #define OP0 += #define OP1 -= - #define OP2 -= + #define OP2 += #else #define OP0 -= #define OP1 -= - #define OP2 += + #define OP2 -= #endif #endif diff --git a/kernel/mips/cgemv_t_msa.c b/kernel/mips/cgemv_t_msa.c index 584e3de75..800667b6e 100644 --- a/kernel/mips/cgemv_t_msa.c +++ b/kernel/mips/cgemv_t_msa.c @@ -32,14 +32,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
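Note on the sign changes above: the OP0/OP1/OP2 fix in cgemv_n_msa.c (and the four-way CONJ/XCONJ split that the next hunk introduces for the transposed kernel) comes straight from complex conjugation. CONJ conjugates the matrix element, XCONJ conjugates the vector element, and each conjugation negates one of the imaginary contributions, which is why four distinct sign patterns are needed. A minimal scalar sketch of that bookkeeping (illustration only, not the MSA kernel):

    /* Complex multiply-accumulate with optional conjugation of either operand;
       the four sign patterns chosen by the CONJ/XCONJ macros fall out of the
       two negations below. */
    static void cmul_acc(float a_r, float a_i, int conj_a,
                         float x_r, float x_i, int conj_x,
                         float *acc_r, float *acc_i)
    {
        if (conj_a) a_i = -a_i;   /* CONJ:  conjugate the matrix element  */
        if (conj_x) x_i = -x_i;   /* XCONJ: conjugate the vector element  */
        *acc_r += a_r * x_r - a_i * x_i;
        *acc_i += a_r * x_i + a_i * x_r;
    }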
#undef OP1 #undef OP2 -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - #define OP0 -= - #define OP1 += - #define OP2 += +#if !defined(CONJ) + #if !defined(XCONJ) + #define OP0 -= + #define OP1 += + #define OP2 += + #else + #define OP0 += + #define OP1 += + #define OP2 -= + #endif #else - #define OP0 += - #define OP1 += - #define OP2 -= + #if !defined(XCONJ) + #define OP0 += + #define OP1 -= + #define OP2 += + #else + #define OP0 -= + #define OP1 -= + #define OP2 -= + #endif #endif #define CGEMV_T_8x4() \ diff --git a/kernel/mips/crot_msa.c b/kernel/mips/crot_msa.c index 5273e38a3..84eb54d6d 100644 --- a/kernel/mips/crot_msa.c +++ b/kernel/mips/crot_msa.c @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, { if ((0 == c) && (0 == s)) { - v4f32 zero = __msa_cast_to_vector_float(0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + v4f32 zero = {0.0, 0.0, 0.0, 0.0}; /* process 2 elements */ for (j = (n >> 1); j--;) diff --git a/kernel/mips/cscal_msa.c b/kernel/mips/cscal_msa.c index 11a1450cf..451d0c921 100644 --- a/kernel/mips/cscal_msa.c +++ b/kernel/mips/cscal_msa.c @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dscal_msa.c b/kernel/mips/dscal_msa.c index 6ce0375ab..2e41d8bef 100644 --- a/kernel/mips/dscal_msa.c +++ b/kernel/mips/dscal_msa.c @@ -44,9 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dswap_msa.c b/kernel/mips/dswap_msa.c index 7b1f02477..67e97f710 100644 --- a/kernel/mips/dswap_msa.c +++ b/kernel/mips/dswap_msa.c @@ -184,7 +184,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - else + else if ((inc_x != 0) && (inc_y != 0)) { for (i = (n >> 3); i--;) { @@ -248,6 +248,32 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - + else + { + if (inc_x == inc_y) + { + if (n & 1) + { + x0 = *srcx; + *srcx = *srcy; + *srcy = x0; + } + else + return (0); + } + else + { + BLASLONG ix = 0, iy = 0; + while (i < n) + { + x0 = srcx[ix]; + srcx[ix] = srcy[iy]; + srcy[iy] = x0; + ix += inc_x; + iy += inc_y; + i++; + } + } + } return (0); } diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c index 9fb5141ca..e2cd3aa4b 100644 --- a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c @@ -186,8 +186,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); - src_a54 = __msa_cast_to_vector_double(*(a + 54)); - src_a54 = (v2f64) __msa_splati_d((v2i64) 
src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a + 54)); src_a62 = LD_DP(a + 62); src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); @@ -200,8 +199,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a44 = LD_DP(a + 44); src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); - src_a36 = __msa_cast_to_vector_double(*(a + 36)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a + 36)); res_c7 *= src_a63; res_c6 -= res_c7 * src_a62; @@ -271,8 +269,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a26 = LD_DP(a + 26); src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); - src_a18 = __msa_cast_to_vector_double(*(a + 18)); - src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a + 18)); res_c3 -= res_c7 * src_a59; res_c2 -= res_c7 * src_a58; @@ -358,8 +355,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a8 = LD_DP(a + 8); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); res_c1 -= res_c2 * src_a17; res_c1 *= src_a9; @@ -488,8 +484,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a52 = LD_DP(a - 12); src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); - src_a54 = __msa_cast_to_vector_double(*(a - 10)); - src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a -10)); src_a40 = LD_DP(a - 24); src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); @@ -526,8 +521,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a34 = LD_DP(a - 30); src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); - src_a36 = __msa_cast_to_vector_double(*(a - 28)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a -28)); res_c4 *= src_a36; res_c3 -= res_c4 * src_a35; @@ -544,10 +538,8 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a16 = LD_DP(a - 48); src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); - src_a18 = __msa_cast_to_vector_double(*(a - 46)); - src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); - src_a0 = __msa_cast_to_vector_double(*(a - 64)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a - 46)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a - 64)); src_a8 = LD_DP(a - 56); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); @@ -785,11 +777,8 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a 
+ 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); @@ -890,11 +879,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c index 525fc8585..74cc1278a 100644 --- a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c @@ -215,8 +215,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c8 * src_a6; res_c15 -= res_c8 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -280,8 +279,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c10 * src_a22; res_c15 -= res_c10 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); @@ -326,8 +324,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c12 * src_a38; res_c15 -= res_c12 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -353,8 +350,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -478,8 +474,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c0 * src_a6; res_c7 -= res_c0 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -515,8 +510,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c2 * src_a22; res_c7 -= res_c2 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) 
src_a28, 0); @@ -553,8 +547,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a38; res_c7 -= res_c4 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -563,8 +556,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c5 * src_a46; res_c7 -= res_c5 * src_a47; - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -786,8 +778,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a2; res_c7 -= res_c4 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -803,8 +794,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; @@ -881,8 +871,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c2 -= res_c0 * src_a2; res_c3 -= res_c0 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -894,8 +883,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c index cb361c511..03036f1c7 100644 --- a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c @@ -161,16 +161,14 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) 
__msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -294,8 +292,7 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -347,8 +344,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) } } - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 *= src_b0; src_c1 *= src_b0; @@ -407,16 +403,14 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -490,8 +484,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c index 581a90f71..4c55a0f37 100644 --- a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c @@ -168,11 +168,9 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -298,8 +296,7 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 16; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); @@ -377,8 +374,7 @@ static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) a -= 8; b -= 1; - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 
*= src_b0; src_c1 *= src_b0; @@ -445,11 +441,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -527,8 +521,7 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 8; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index ee0dea0b7..b887800ed 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -63,16 +63,12 @@ inline static void prefetch_load_lf(unsigned char *src) #define ST_DP(...) ST_D(v2f64, __VA_ARGS__) #define COPY_FLOAT_TO_VECTOR(a) ( { \ - v4f32 out; \ - out = __msa_cast_to_vector_float(a); \ - out = (v4f32) __msa_splati_w((v4i32) out, 0); \ + v4f32 out = {a, a, a, a}; \ out; \ } ) #define COPY_DOUBLE_TO_VECTOR(a) ( { \ - v2f64 out; \ - out = __msa_cast_to_vector_double(a); \ - out = (v2f64) __msa_splati_d((v2i64) out, 0); \ + v2f64 out = {a, a}; \ out; \ } ) diff --git a/kernel/mips/srot_msa.c b/kernel/mips/srot_msa.c index 75730241a..79d921b7a 100644 --- a/kernel/mips/srot_msa.c +++ b/kernel/mips/srot_msa.c @@ -48,11 +48,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, { if ((0 == c) && (0 == s)) { - v4f32 zero = __msa_cast_to_vector_float(0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + v4f32 zero = {0.0, 0.0, 0.0, 0.0}; /* process 4 floats */ for (j = (n >> 2); j--;) diff --git a/kernel/mips/sscal_msa.c b/kernel/mips/sscal_msa.c index 64b62d659..66e17b844 100644 --- a/kernel/mips/sscal_msa.c +++ b/kernel/mips/sscal_msa.c @@ -44,11 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 6); i--;) { diff --git a/kernel/mips/sswap_msa.c b/kernel/mips/sswap_msa.c index 46fa8aa87..d412285b0 100644 --- a/kernel/mips/sswap_msa.c +++ b/kernel/mips/sswap_msa.c @@ -198,7 +198,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - else + else if ((inc_x != 0) && (inc_y != 0)) { for (i = (n >> 3); i--;) { @@ -262,6 +262,33 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } + else + { + if (inc_x == inc_y) + { + if (n & 1) + { + x0 = *srcx; + *srcx = *srcy; + *srcy = x0; + } + else + return (0); + } + else + { + BLASLONG ix 
= 0, iy = 0; + while (i < n) + { + x0 = srcx[ix]; + srcx[ix] = srcy[iy]; + srcy[iy] = x0; + ix += inc_x; + iy += inc_y; + i++; + } + } + } return (0); } diff --git a/kernel/mips/zgemv_n_msa.c b/kernel/mips/zgemv_n_msa.c index 669c25758..97a80b4ba 100644 --- a/kernel/mips/zgemv_n_msa.c +++ b/kernel/mips/zgemv_n_msa.c @@ -56,11 +56,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(XCONJ) #define OP0 += #define OP1 -= - #define OP2 -= + #define OP2 += #else #define OP0 -= #define OP1 -= - #define OP2 += + #define OP2 -= #endif #endif diff --git a/kernel/mips/zgemv_t_msa.c b/kernel/mips/zgemv_t_msa.c index e6febb577..6492f90be 100644 --- a/kernel/mips/zgemv_t_msa.c +++ b/kernel/mips/zgemv_t_msa.c @@ -34,14 +34,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #undef OP3 #undef OP4 -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - #define OP0 -= - #define OP1 += - #define OP2 += +#if !defined(CONJ) + #if !defined(XCONJ) + #define OP0 -= + #define OP1 += + #define OP2 += + #else + #define OP0 += + #define OP1 += + #define OP2 -= + #endif #else - #define OP0 += - #define OP1 += - #define OP2 -= + #if !defined(XCONJ) + #define OP0 += + #define OP1 -= + #define OP2 += + #else + #define OP0 -= + #define OP1 -= + #define OP2 -= + #endif #endif #define ZGEMV_T_8x1() \ diff --git a/kernel/mips/zscal_msa.c b/kernel/mips/zscal_msa.c index 5a8766d3c..a45c3cecd 100644 --- a/kernel/mips/zscal_msa.c +++ b/kernel/mips/zscal_msa.c @@ -49,9 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { @@ -475,9 +473,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { diff --git a/kernel/mips64/KERNEL.LOONGSON3B b/kernel/mips64/KERNEL.LOONGSON3B deleted file mode 100644 index e476c631e..000000000 --- a/kernel/mips64/KERNEL.LOONGSON3B +++ /dev/null @@ -1,64 +0,0 @@ -SAXPYKERNEL=axpy_loongson3a.S -DAXPYKERNEL=daxpy_loongson3a_simd.S - -SGEMVNKERNEL = gemv_n_loongson3a.c -SGEMVTKERNEL = gemv_t_loongson3a.c -DGEMVNKERNEL = gemv_n_loongson3a.c -DGEMVTKERNEL = gemv_t_loongson3a.c -CGEMVNKERNEL = zgemv_n_loongson3a.c -CGEMVTKERNEL = zgemv_t_loongson3a.c -ZGEMVNKERNEL = zgemv_n_loongson3a.c -ZGEMVTKERNEL = zgemv_t_loongson3a.c - -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o 
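The MSA hunks above all follow one pattern: the removed __msa_cast_to_vector_float/__msa_cast_to_vector_double intrinsics (no longer provided by newer GCC) are replaced with plain vector literals, either through the reworked COPY_FLOAT_TO_VECTOR/COPY_DOUBLE_TO_VECTOR macros or through brace-initialized zero vectors in the scal/rot/swap kernels. A minimal sketch of the replacement, assuming an MSA-enabled GCC where <msa.h> provides the v4f32/v2f64 types:

    #include <msa.h>

    /* Brace initialization splats a scalar across all lanes, replacing the
       cast_to_vector + splati/insert sequences that the patch removes. */
    static inline v2f64 splat_d(double a)
    {
        v2f64 out = {a, a};
        return out;
    }

    static inline v4f32 splat_s(float a)
    {
        v4f32 out = {a, a, a, a};
        return out;
    }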
-CGEMMOTCOPYOBJ = cgemm_otcopy.o - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3R3 similarity index 75% rename from kernel/mips64/KERNEL.LOONGSON3A rename to kernel/mips64/KERNEL.LOONGSON3R3 index 0298faaad..904828d57 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3R3 @@ -16,32 +16,32 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -64,6 +64,3 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DSDOTKERNEL = ../mips/dot.c - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3R4 b/kernel/mips64/KERNEL.LOONGSON3R4 new file mode 100644 index 000000000..b81e5441d --- /dev/null +++ b/kernel/mips64/KERNEL.LOONGSON3R4 @@ -0,0 +1,192 @@ +ifdef HAVE_MSA +SAXPYKERNEL = ../mips/saxpy_msa.c +DAXPYKERNEL = ../mips/daxpy_msa.c +CAXPYKERNEL = ../mips/caxpy_msa.c +ZAXPYKERNEL = ../mips/zaxpy_msa.c +else +SAXPYKERNEL = 
axpy_loongson3a.S +DAXPYKERNEL = daxpy_loongson3a_simd.S +endif + +ifdef HAVE_MSA +SCOPYKERNEL = ../mips/scopy_msa.c +DCOPYKERNEL = ../mips/dcopy_msa.c +CCOPYKERNEL = ../mips/ccopy_msa.c +ZCOPYKERNEL = ../mips/zcopy_msa.c +endif + +ifdef HAVE_MSA +SDOTKERNEL = ../mips/sdot_msa.c +DDOTKERNEL = ../mips/ddot_msa.c +CDOTKERNEL = ../mips/cdot_msa.c +ZDOTKERNEL = ../mips/zdot_msa.c +endif +DSDOTKERNEL = ../mips/dot.c + +ifdef HAVE_MSA +SROTKERNEL = ../mips/srot_msa.c +DROTKERNEL = ../mips/drot_msa.c +CROTKERNEL = ../mips/crot_msa.c +ZROTKERNEL = ../mips/zrot_msa.c +endif + +ifdef HAVE_MSA +SSCALKERNEL = ../mips/sscal_msa.c +DSCALKERNEL = ../mips/dscal_msa.c +CSCALKERNEL = ../mips/cscal_msa.c +ZSCALKERNEL = ../mips/zscal_msa.c +endif + +ifdef HAVE_MSA +SGEMVNKERNEL = ../mips/sgemv_n_msa.c +DGEMVNKERNEL = ../mips/dgemv_n_msa.c +SGEMVTKERNEL = ../mips/sgemv_t_msa.c +DGEMVTKERNEL = ../mips/dgemv_t_msa.c +CGEMVNKERNEL = ../mips/cgemv_n_msa.c +CGEMVTKERNEL = ../mips/cgemv_t_msa.c +ZGEMVNKERNEL = ../mips/zgemv_n_msa.c +ZGEMVTKERNEL = ../mips/zgemv_t_msa.c +else +SGEMVNKERNEL = gemv_n_loongson3a.c +SGEMVTKERNEL = gemv_t_loongson3a.c +DGEMVNKERNEL = gemv_n_loongson3a.c +DGEMVTKERNEL = gemv_t_loongson3a.c +CGEMVNKERNEL = zgemv_n_loongson3a.c +CGEMVTKERNEL = zgemv_t_loongson3a.c +ZGEMVNKERNEL = zgemv_n_loongson3a.c +ZGEMVTKERNEL = zgemv_t_loongson3a.c +endif + +ifdef HAVE_MSA +SASUMKERNEL = ../mips/sasum_msa.c +DASUMKERNEL = ../mips/dasum_msa.c +CASUMKERNEL = ../mips/casum_msa.c +ZASUMKERNEL = ../mips/zasum_msa.c +endif + +ifdef HAVE_MSA +SSWAPKERNEL = ../mips/sswap_msa.c +DSWAPKERNEL = ../mips/dswap_msa.c +CSWAPKERNEL = ../mips/cswap_msa.c +ZSWAPKERNEL = ../mips/zswap_msa.c +endif + +ifdef HAVE_MSA +SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c +SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c +SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +SGEMMKERNEL = sgemm_kernel_8x4_ps.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c +DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c +DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c +DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c +DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c +CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c +CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c +CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c +CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S +CGEMMINCOPY = 
../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c +ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c +ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c +STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c +STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c +STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c +else +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c +DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c +DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c +DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c +else +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index c25cd9f04..79d889fe0 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -1,7 +1,6 @@ -ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +ifeq ($(HAVE_GAS), 1) include $(KERNELDIR)/KERNEL.POWER8 else - #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -33,6 +32,16 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +SGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c +SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_power10.c +SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_nn_power10.c +SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_power10.c +SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_nt_power10.c +SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_power10.c +SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_tn_power10.c +SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_power10.c +SGEMM_SMALL_K_B0_TT = 
sgemm_small_kernel_tt_power10.c + DGEMMKERNEL = dgemm_kernel_power10.c DGEMMINCOPY = DGEMMITCOPY = @@ -43,7 +52,18 @@ DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c +DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_power10.c +DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_nt_power10.c +DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_power10.c +DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_power10.c +DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_power10.c +DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c +DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c +DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c + CGEMMKERNEL = cgemm_kernel_power10.S +#CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c @@ -63,15 +83,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_power10.c +STRSMKERNEL_LT = trsm_kernel_LT_power10.c +STRSMKERNEL_RN = trsm_kernel_RN_power10.c +STRSMKERNEL_RT = trsm_kernel_RT_power10.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_LN = trsm_kernel_LN_power10.c +DTRSMKERNEL_LT = trsm_kernel_LT_power10.c +DTRSMKERNEL_RN = trsm_kernel_RN_power10.c +DTRSMKERNEL_RT = trsm_kernel_RT_power10.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -154,11 +174,7 @@ ZCOPYKERNEL = zcopy_power10.c SDOTKERNEL = sdot_power10.c DDOTKERNEL = ddot_power10.c DSDOTKERNEL = sdot_power10.c -ifneq ($(GCCVERSIONGTEQ9),1) -CDOTKERNEL = cdot_power9.S -else CDOTKERNEL = cdot.c -endif ZDOTKERNEL = zdot.c # SNRM2KERNEL = ../arm/nrm2.c @@ -173,8 +189,13 @@ ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c +ifeq ($(C_COMPILER), PGI) +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c +else CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c +endif # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c @@ -185,7 +206,7 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = sgemv_n.c DGEMVNKERNEL = dgemv_n_power10.c CGEMVNKERNEL = cgemv_n.c -ZGEMVNKERNEL = zgemv_n_4.c +ZGEMVNKERNEL = zgemv_n_power10.c # SGEMVTKERNEL = sgemv_t.c DGEMVTKERNEL = dgemv_t_power10.c @@ -217,5 +238,4 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c - endif diff --git a/kernel/power/KERNEL.POWER5 b/kernel/power/KERNEL.POWER5 index fbef79e59..bea7b17c8 100644 --- a/kernel/power/KERNEL.POWER5 +++ b/kernel/power/KERNEL.POWER5 @@ -54,3 +54,8 @@ ZTRSMKERNEL_LN = ztrsm_kernel_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S + +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c +SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVTKERNEL = ../arm/gemv_t.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index c2f4cd204..2b8e65948 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -242,8 +242,13 @@ ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c 
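The POWER10 microkernels added later in this patch (caxpy, ccopy, cdot) pick their permute masks and store order at compile time from the target byte order. A minimal sketch of that dispatch, assuming a GCC-compatible compiler that predefines __BYTE_ORDER__; the mask values are the ones used in the caxpy/cdot hunks below:

    #include <stdio.h>

    /* Reverse-permute mask for exchanging real/imaginary parts; the byte
       layout differs between big- and little-endian POWER. */
    #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    static const unsigned char swap_mask[16] =
        { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 };
    #else
    static const unsigned char swap_mask[16] =
        { 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 };
    #endif

    int main(void)
    {
        printf("%u\n", swap_mask[0]);   /* 4 on big endian, 11 on little endian */
        return 0;
    }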
DSCALKERNEL = dscal.c +ifeq ($(C_COMPILER), PGI) +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c +else CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c +endif # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index ab8fbfcd9..b6b102b3e 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -52,15 +52,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_power10.c +STRSMKERNEL_LT = trsm_kernel_LT_power10.c +STRSMKERNEL_RN = trsm_kernel_RN_power10.c +STRSMKERNEL_RT = trsm_kernel_RT_power10.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LN = trsm_kernel_LN_power10.c DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_RN = trsm_kernel_RN_power10.c +DTRSMKERNEL_RT = trsm_kernel_RT_power10.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -166,8 +166,13 @@ ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c +ifeq ($(C_COMPILER), PGI) +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c +else CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c +endif # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c diff --git a/kernel/power/KERNEL.PPC440 b/kernel/power/KERNEL.PPC440 index 677af5f21..fd9a8c780 100644 --- a/kernel/power/KERNEL.PPC440 +++ b/kernel/power/KERNEL.PPC440 @@ -16,11 +16,11 @@ ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) -CAXPYKERNEL = ../arm/zaxpy.c -ZAXPYKERNEL = ../arm/zaxpy.c -else CAXPYKERNEL = zaxpy_ppc440.S ZAXPYKERNEL = zaxpy_ppc440.S +else +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c endif SDOTKERNEL = dot_ppc440.S diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 index 54660b54d..1bdd3119e 100644 --- a/kernel/power/KERNEL.PPCG4 +++ b/kernel/power/KERNEL.PPCG4 @@ -15,8 +15,13 @@ ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) CAXPYKERNEL = zaxpy_ppc440.S ZAXPYKERNEL = zaxpy_ppc440.S +else +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c +endif SDOTKERNEL = dot_ppc440.S DDOTKERNEL = dot_ppc440.S diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c index 0d13416b3..902eba82c 100644 --- a/kernel/power/caxpy_microk_power10.c +++ b/kernel/power/caxpy_microk_power10.c @@ -36,9 +36,12 @@ static void caxpy_kernel_8 (long n, float *x, float *y, #endif const float *mvecp = mvec; /* We have to load reverse mask for big endian. 
*/ - /* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */ - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; +#else __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; +#endif + long ytmp; __asm__ @@ -112,10 +115,25 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" - "stxvp 48, 0(%4) \n\t" - "stxvp 50, 32(%4) \n\t" - "stxvp 34, 64(%4) \n\t" - "stxvp 38, 96(%4) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%4) \n\t" + "stxv 49, 16(%4) \n\t" + "stxv 50, 32(%4) \n\t" + "stxv 51, 48(%4) \n\t" + "stxv 34, 64(%4) \n\t" + "stxv 35, 80(%4) \n\t" + "stxv 38, 96(%4) \n\t" + "stxv 39, 112(%4) \n\t" +#else + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" +#endif "addi %4, %4, 128 \n\t" "xxperm 52, 40, %x10 \n\t" // exchange real and imag part @@ -159,10 +177,25 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" - "stxvp 48, 0(%4) \n\t" - "stxvp 50, 32(%4) \n\t" - "stxvp 34, 64(%4) \n\t" - "stxvp 38, 96(%4) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%4) \n\t" + "stxv 49, 16(%4) \n\t" + "stxv 50, 32(%4) \n\t" + "stxv 51, 48(%4) \n\t" + "stxv 34, 64(%4) \n\t" + "stxv 35, 80(%4) \n\t" + "stxv 38, 96(%4) \n\t" + "stxv 39, 112(%4) \n\t" +#else + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" +#endif "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" : diff --git a/kernel/power/ccopy_microk_power10.c b/kernel/power/ccopy_microk_power10.c new file mode 100644 index 000000000..f30e1fa09 --- /dev/null +++ b/kernel/power/ccopy_microk_power10.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL 1 + +static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ + ( + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + "addi %2, %2, 256 \n\t" + "addic. %1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 32, 0(%3) \n\t" + "stxv 33, 16(%3) \n\t" + "stxv 34, 32(%3) \n\t" + "stxv 35, 48(%3) \n\t" + "stxv 36, 64(%3) \n\t" + "stxv 37, 80(%3) \n\t" + "stxv 38, 96(%3) \n\t" + "stxv 39, 112(%3) \n\t" +#else + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" +#endif + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 40, 128(%3) \n\t" + "stxv 41, 144(%3) \n\t" + "stxv 42, 160(%3) \n\t" + "stxv 43, 176(%3) \n\t" + "stxv 44, 192(%3) \n\t" + "stxv 45, 208(%3) \n\t" + "stxv 46, 224(%3) \n\t" + "stxv 47, 240(%3) \n\t" +#else + "stxv 41, 128(%3) \n\t" + "stxv 40, 144(%3) \n\t" + "stxv 43, 160(%3) \n\t" + "stxv 42, 176(%3) \n\t" + "stxv 45, 192(%3) \n\t" + "stxv 44, 208(%3) \n\t" + "stxv 47, 224(%3) \n\t" + "stxv 46, 240(%3) \n\t" +#endif + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + + "addi %3, %3, 256 \n\t" + "addi %2, %2, 256 \n\t" + + "addic. 
%1, %1, -32 \n\t" + "bgt one%= \n" + + "two%=: \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 32, 0(%3) \n\t" + "stxv 33, 16(%3) \n\t" + "stxv 34, 32(%3) \n\t" + "stxv 35, 48(%3) \n\t" + "stxv 36, 64(%3) \n\t" + "stxv 37, 80(%3) \n\t" + "stxv 38, 96(%3) \n\t" + "stxv 39, 112(%3) \n\t" + "stxv 40, 128(%3) \n\t" + "stxv 41, 144(%3) \n\t" + "stxv 42, 160(%3) \n\t" + "stxv 43, 176(%3) \n\t" + "stxv 44, 192(%3) \n\t" + "stxv 45, 208(%3) \n\t" + "stxv 46, 224(%3) \n\t" + "stxv 47, 240(%3) \n\t" +#else + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" + "stxv 41, 128(%3) \n\t" + "stxv 40, 144(%3) \n\t" + "stxv 43, 160(%3) \n\t" + "stxv 42, 176(%3) \n\t" + "stxv 45, 192(%3) \n\t" + "stxv 44, 208(%3) \n\t" + "stxv 47, 224(%3) \n\t" + "stxv 46, 240(%3) \n\t" +#endif + "#n=%1 x=%4=%2 y=%0=%3" + : + "=m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" + ); +} diff --git a/kernel/power/ccopy_power10.c b/kernel/power/ccopy_power10.c index a5877cd12..41c510460 100644 --- a/kernel/power/ccopy_power10.c +++ b/kernel/power/ccopy_power10.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(__VEC__) || defined(__ALTIVEC__) -#include "copy_microk_power10.c" +#include "ccopy_microk_power10.c" #endif #ifndef HAVE_KERNEL @@ -86,7 +86,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1 )) { - BLASLONG n1 = n & -64; + BLASLONG n1 = n & -32; if ( n1 > 0 ) { copy_kernel(n1, x, y); diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c index ef5e4710f..c53fe0c02 100644 --- a/kernel/power/cdot.c +++ b/kernel/power/cdot.c @@ -28,6 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #include "common.h" +#if defined(POWER10) +#include "cdot_microk_power10.c" +#else #ifndef HAVE_KERNEL_8 #include @@ -99,6 +102,7 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) } #endif +#endif OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { @@ -116,7 +120,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA if ((inc_x == 1) && (inc_y == 1)) { +#if defined(POWER10) + BLASLONG n1 = n & -16; +#else BLASLONG n1 = n & -8; +#endif BLASLONG j=0; if (n1){ diff --git a/kernel/power/cdot_microk_power10.c b/kernel/power/cdot_microk_power10.c new file mode 100644 index 000000000..9d42559c9 --- /dev/null +++ b/kernel/power/cdot_microk_power10.c @@ -0,0 +1,185 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void cdot_kernel_8 (long n, float *x, float *y, float *dot) +{ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + __vector unsigned char mask = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; +#else + __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; +#endif + __asm__ + ( + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "xxperm 56, 48, %x7 \n\t" + "xxperm 57, 49, %x7 \n\t" + "xxperm 58, 50, %x7 \n\t" + "xxperm 59, 51, %x7 \n\t" + + "xxperm 60, 52, %x7 \n\t" + "xxperm 61, 53, %x7 \n\t" + "xxperm 62, 54, %x7 \n\t" + "xxperm 63, 55, %x7 \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i + "lxvp 48, 0(%3) \n\t" + + "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i + "lxvp 50, 32(%3) \n\t" + + "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r + "lxvp 40, 0(%2) \n\t" + + "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r + "lxvp 42, 32(%2) \n\t" + + "xxperm 56, 48, %x7 \n\t" + "xxperm 57, 49, %x7 \n\t" + "xxperm 58, 50, %x7 \n\t" + "xxperm 59, 51, %x7 \n\t" + + "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i + "lxvp 52, 64(%3) \n\t" + + "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i + "lxvp 54, 96(%3) \n\t" + + "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r + "lxvp 44, 64(%2) \n\t" + "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r + "lxvp 46, 96(%2) \n\t" + + "xxperm 60, 52, %x7 \n\t" + "xxperm 61, 53, %x7 \n\t" + "xxperm 62, 54, %x7 \n\t" + "xxperm 63, 55, %x7 \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i + "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i + + "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r + "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r + + "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i + "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i + + "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r + "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 33, 33, 35 \n\t" + "xvaddsp 37, 37, 39 \n\t" + + "xvaddsp 35, 32, 36 \n\t" + "xvaddsp 34, 33, 37 \n\t" + "xxswapd 32, 35 \n\t" + "xxswapd 33, 34 \n\t" + "xvaddsp 35, 35, 32 \n\t" + "xvaddsp 34, 34, 33 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xxpermdi 34, 35, 34, 0 \n\t" +#else + "xxpermdi 34, 34, 35, 2 \n\t" +#endif + "stxv 34, 0(%6) \n\t" + + "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6" + : + "=m" (*dot), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x), + "m" (*y), + "b" (dot), // 6 + "wa" (mask) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/cgemm_kernel_power10.S b/kernel/power/cgemm_kernel_power10.S index e04f948dd..fbd22aaad 100644 --- a/kernel/power/cgemm_kernel_power10.S +++ 
b/kernel/power/cgemm_kernel_power10.S @@ -76,11 +76,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cgemm_macros_power10.S" +#if (_AIX) +.set perm_const1, 0x0405060700010203 +.set perm_const2, 0x0c0d0e0f08090a0b +.set save_permute_12, 0x1011121300010203 +.set save_permute_11, 0x18191a1b08090a0b +#else .equ perm_const1, 0x0405060700010203 .equ perm_const2, 0x0c0d0e0f08090a0b .equ save_permute_12, 0x0c0d0e0f1c1d1e1f .equ save_permute_11, 0x0405060714151617 - +#endif #ifndef NEEDPARAM @@ -172,24 +178,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*load reverse permute mask for big endian uint128 = 0xc0d0e0f08090a0b0405060700010203 */ - +#if (_AIX) + lis T2, (perm_const2>>48 & 0xFFFF) + lis T1, (perm_const1>>48 & 0xFFFF) + lis T3, (save_permute_12>>48 & 0xFFFF) + lis T4, (save_permute_11>>48 & 0xFFFF) + + ori T2, T2, (perm_const2>>32 & 0xFFFF) + ori T1, T1, (perm_const1>>32 & 0xFFFF) + ori T3, T3, (save_permute_12>>32 & 0xFFFF) + ori T4, T4, (save_permute_11>>32 & 0xFFFF) +#else lis T2, perm_const2@highest lis T1, perm_const1@highest lis T3, save_permute_12@highest lis T4, save_permute_11@highest - ori T2, T2, perm_const2@higher ori T1, T1, perm_const1@higher ori T3, T3, save_permute_12@higher ori T4, T4, save_permute_11@higher - +#endif rldicr T2, T2, 32, 31 rldicr T1, T1, 32, 31 rldicr T3, T3, 32, 31 rldicr T4, T4, 32, 31 +#if (_AIX) + oris T2, T2, (perm_const2>>16 & 0xFFFF) + oris T1, T1, (perm_const1>>16 & 0xFFFF) + oris T3, T3, (save_permute_12>>16 & 0xFFFF) + oris T4, T4, (save_permute_11>>16 & 0xFFFF) + + ori T2, T2, (perm_const2 & 0xFFFF) + ori T1, T1, (perm_const1 & 0xFFFF) + ori T3, T3, (save_permute_12 & 0xFFFF) + ori T4, T4, (save_permute_11 & 0xFFFF) +#else oris T2, T2, perm_const2@h oris T1, T1, perm_const1@h oris T3, T3, save_permute_12@h @@ -200,7 +226,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ori T1, T1, perm_const1@l ori T3, T3, save_permute_12@l ori T4, T4, save_permute_11@l - +#endif li r0,0 li PRE,512 diff --git a/kernel/power/cgemm_macros_power10.S b/kernel/power/cgemm_macros_power10.S index b66e93405..f75bf5dad 100644 --- a/kernel/power/cgemm_macros_power10.S +++ b/kernel/power/cgemm_macros_power10.S @@ -218,6 +218,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 36, 34 + xvf32gerpp 2, 37, 34 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 7, 36, 35 + xvf32gerpp 6, 37, 35 + xvf32gerpp 5, 32, 35 + xvf32gerpp 4, 33, 35 +#else xvf32gerpp 3, 36, 35 xvf32gerpp 2, 37, 35 xvf32gerpp 1, 32, 35 @@ -226,6 +236,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 6, 37, 34 xvf32gerpp 5, 32, 34 xvf32gerpp 4, 33, 34 +#endif .endm .macro LOAD4x8_2 @@ -255,6 +266,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xvf32gerpp 3, 36, 34 + xvf32gerpp 2, 37, 34 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 7, 36, 35 + xvf32gerpp 6, 37, 35 + xvf32gerpp 5, 32, 35 + xvf32gerpp 4, 33, 35 +#else xvf32gerpp 3, 36, 35 xvf32gerpp 2, 37, 35 xvf32gerpp 1, 32, 35 @@ -263,11 +284,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvf32gerpp 6, 37, 34 xvf32gerpp 5, 32, 34 xvf32gerpp 4, 33, 34 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) .endif +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xvf32gerpp 3, 42, 38 + xvf32gerpp 2, 43, 38 + xvf32gerpp 1, 40, 38 + xvf32gerpp 0, 41, 38 + xvf32gerpp 7, 42, 39 + xvf32gerpp 6, 43, 39 + xvf32gerpp 5, 40, 39 + xvf32gerpp 4, 41, 39 +#else xvf32gerpp 3, 42, 39 xvf32gerpp 2, 43, 39 xvf32gerpp 1, 40, 39 @@ -276,6 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 6, 43, 38 xvf32gerpp 5, 40, 38 xvf32gerpp 4, 41, 38 +#endif .if \Complete==0 lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG) lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) @@ -393,22 +426,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs26, vs26, vs7 xvaddsp vs27, vs27, vs5 xvaddsp vs28, vs28, vs11 xvaddsp vs29, vs29, vs9 xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 +#else +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs4, vs12, 1 + xxpermdi vs26, vs6, vs14, 1 + xxpermdi vs29, vs8, vs0, 1 + xxpermdi vs28, vs10, vs2, 1 + xxpermdi vs31, vs12, vs4, 1 + xxpermdi vs30, vs14, vs6, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 @@ -418,6 +475,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs28, vs2, vs10, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi vs30, vs6, vs14, 2 +#endif #endif stxvp vs24, 0(CO) MULT_APLHA_PART1 vs48, vs56, vs0, vs1 @@ -443,22 +501,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs32, vs32, vs3 xvaddsp vs33, vs33, vs1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs40, vs40, vs7 xvaddsp vs41, vs41, vs5 xvaddsp vs34, vs34, vs11 xvaddsp vs35, vs35, vs9 xvaddsp vs42, vs42, vs15 xvaddsp vs43, vs43, vs13 +#else +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xxpermdi vs33, vs0, vs8, 1 + xxpermdi vs32, vs2, vs10, 1 + xxpermdi vs41, vs4, vs12, 1 + xxpermdi vs40, vs6, vs14, 1 + xxpermdi vs35, vs8, vs0, 1 + xxpermdi vs34, vs10, vs2, 1 + xxpermdi vs43, vs12, vs4, 1 + xxpermdi vs42, vs14, vs6, 1 #else xxpermdi vs33, vs8, vs0, 2 xxpermdi vs32, vs10, vs2, 2 @@ -468,6 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs34, vs2, vs10, 2 xxpermdi vs43, vs4, vs12, 2 xxpermdi vs42, vs6, vs14, 2 +#endif #endif stxvp vs32, 0(T2) stxvp vs40, 32(T2) @@ -510,10 +593,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 32, 35 + xvf32gerpp 2, 33, 35 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 +#else xvf32gerpp 3, 32, 34 xvf32gerpp 2, 33, 34 xvf32gerpp 1, 32, 35 xvf32gerpp 0, 33, 35 +#endif .endm .macro LOAD4x4_2 @@ -541,18 +631,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 32, 35 + xvf32gerpp 2, 33, 35 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 +#else xvf32gerpp 3, 32, 34 xvf32gerpp 2, 33, 34 xvf32gerpp 1, 32, 35 xvf32gerpp 0, 33, 35 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 36, 39 + xvf32gerpp 2, 37, 39 + xvf32gerpp 1, 36, 38 + xvf32gerpp 0, 37, 38 +#else xvf32gerpp 3, 36, 38 xvf32gerpp 2, 37, 38 xvf32gerpp 1, 36, 39 xvf32gerpp 0, 37, 39 +#endif .if \Complete==0 lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) @@ -606,6 +710,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs9, vs0, vs8, 2 @@ -614,6 +728,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs7, vs14, vs6, 2 xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 xvaddsp vs26, vs26, vs11 @@ -622,6 +737,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvaddsp vs29, vs29, vs5 xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs8, vs0, 1 + xxpermdi vs26, vs10, vs2, 1 + xxpermdi vs29, vs4, vs12, 1 + xxpermdi vs28, vs6, vs14, 1 + xxpermdi vs31, vs12, vs4, 1 + xxpermdi vs30, vs14, vs6, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 @@ -631,6 +756,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs28, vs14, vs6, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi vs30, vs6, vs14, 2 +#endif #endif stxvp vs24, 0(CO) stxvp vs26, 0(T1) @@ -672,8 +798,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 1, 35, 32 + xvf32gerpp 0, 34, 32 +#else xvf32gerpp 1, 34, 32 xvf32gerpp 0, 35, 32 +#endif .endm .macro LOAD4x2_2 @@ -700,13 +831,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 1, 35, 32 + xvf32gerpp 0, 34, 32 +#else xvf32gerpp 1, 34, 33 xvf32gerpp 0, 35, 33 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 1, 37, 33 + xvf32gerpp 0, 36, 33 +#else xvf32gerpp 1, 36, 32 xvf32gerpp 0, 37, 32 +#endif .if \Complete==0 lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) @@ -757,19 +898,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR1 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 0 + xxpermdi vs9, vs2, vs10, 0 + xxpermdi vs3, vs8, vs0, 3 + xxpermdi vs11, vs10, vs2, 3 +#else xxpermdi vs1, vs8, vs0, 0 xxpermdi vs9, vs10, vs2, 0 xxpermdi vs3, vs0, vs8, 3 xxpermdi vs11, vs2, vs10, 3 +#endif xvaddsp vs24, vs24, vs1 xvaddsp vs26, vs26, vs9 xvaddsp vs25, vs25, vs3 xvaddsp vs27, vs27, vs11 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs24, vs0, vs8, 0 + xxpermdi vs26, vs2, vs10, 0 + xxpermdi vs25, vs8, vs0, 3 + xxpermdi vs27, vs10, vs2, 3 #else xxpermdi vs24, vs8, vs0, 0 xxpermdi vs26, vs10, vs2, 0 xxpermdi vs25, vs0, vs8, 3 xxpermdi vs27, vs2, vs10, 3 +#endif #endif stxv vs24, 0(CO) stxv vs25, 0(T1) @@ -811,8 +966,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 34, 32 + xvf32gerpp 1, 35, 32 +#else xvf32gerpp 0, 35, 32 xvf32gerpp 1, 34, 32 +#endif .endm .macro LOAD4x1_2 @@ -822,8 +982,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD4x1_2O OffsetA, OffsetB lxv vs32, (\OffsetA)(AO) vspltisb v6, 0 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs33, vs32, vs38, 2 + xxpermdi vs32, vs32, vs38, 0 +#else xxpermdi vs33, vs32, vs38, 0 xxpermdi vs32, vs32, vs38, 2 +#endif lxvp vs34, (0+\OffsetB)(BO) lxvp vs36, (32+\OffsetB)(BO) .endm @@ -842,18 +1007,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 34, 32 + xvf32gerpp 1, 35, 32 +#else xvf32gerpp 0, 35, 32 xvf32gerpp 1, 34, 32 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 36, 33 + xvf32gerpp 1, 37, 33 +#else xvf32gerpp 0, 37, 33 xvf32gerpp 1, 36, 33 +#endif .if \Complete==0 lxv vs32, DISP2(\Index, \OffsetA)(\AREG) lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs33, vs32, vs38, 2 + xxpermdi vs32, vs32, vs38, 0 +#else xxpermdi vs33, vs32, vs38, 0 xxpermdi vs32, vs32, vs38, 2 +#endif .endif .if \IsLast==1 .if \Complete==1 @@ -1001,19 +1181,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 2, 37, 34 + xvf32gerpp 3, 36, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +#else xvf32gerpp 2, 37, 35 xvf32gerpp 3, 36, 35 xvf32gerpp 0, 33, 35 xvf32gerpp 1, 32, 35 +#endif .if \Complete==0 lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 2, 41, 35 + xvf32gerpp 3, 40, 35 + xvf32gerpp 0, 39, 35 + xvf32gerpp 1, 38, 35 +#else xvf32gerpp 2, 41, 34 xvf32gerpp 3, 40, 34 xvf32gerpp 0, 39, 34 xvf32gerpp 1, 38, 34 +#endif .if \Complete==0 lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) @@ -1068,16 +1262,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs26, vs26, vs7 xvaddsp vs27, vs27, vs5 xvaddsp vs28, vs28, vs11 @@ -1085,6 +1293,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 #else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs4, vs12, 1 + xxpermdi vs26, vs6, vs14, 1 + xxpermdi vs29, vs8, vs0, 1 + xxpermdi vs28, vs10, vs2, 1 + xxpermdi vs31, vs12, vs4, 1 + xxpermdi vs30, vs14, vs6, 1 +#else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 xxpermdi vs27, vs12, vs4, 2 @@ -1093,6 +1311,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs28, vs2, vs10, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi vs30, vs6, vs14, 2 +#endif #endif stxvp vs24, 0(CO) stxvp vs26, 32(CO) @@ -1161,13 +1380,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +#else xvf32gerpp 0, 33, 35 xvf32gerpp 1, 32, 35 +#endif .if \Complete==0 lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 37, 35 + xvf32gerpp 1, 36, 35 +#else xvf32gerpp 0, 37, 34 xvf32gerpp 1, 36, 34 +#endif + .if \Complete==0 lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) @@ -1206,19 +1436,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR1 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 xvaddsp vs26, vs26, vs11 xvaddsp vs27, vs27, vs9 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs8, vs0, 1 + xxpermdi vs26, vs10, vs2, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 xxpermdi vs27, vs0, vs8, 2 xxpermdi vs26, vs2, vs10, 2 +#endif #endif stxvp vs24, 0(CO) stxvp vs26, 0(T1) @@ -1330,13 +1574,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxperm vs8, vs9, save_permute_1 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 0 + xxpermdi vs9, vs8, vs0, 3 +#else xxpermdi vs1, vs8, vs0, 0 xxpermdi vs9, vs0, vs8, 3 +#endif xvaddsp vs24, vs24, vs1 xvaddsp vs26, vs26, vs9 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs24, vs0, vs8, 0 + xxpermdi vs26, vs8, vs0, 3 #else xxpermdi vs24, vs8, vs0, 0 xxpermdi vs26, vs0, vs8, 3 +#endif #endif stxv vs24, 0(CO) stxv vs26, 0(T1) @@ -1528,8 +1782,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs32, (0+\OffsetA)(AO) lxvp vs36, (32+\OffsetA)(AO) vspltisb v10, 0 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs42, 2 + xxpermdi vs34, vs34, vs42, 0 +#else xxpermdi vs35, vs34, vs42, 0 xxpermdi vs34, vs34, vs42, 2 +#endif lxvp vs38, (64+\OffsetA)(AO) lxvp vs40, (64+32+\OffsetA)(AO) .endm @@ -1567,8 +1826,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 3, 35, 40 .if \Complete==0 lxv vs34, DISP2(\Index, \OffsetB)(\BREG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs42, 2 + xxpermdi vs34, vs34, vs42, 0 +#else xxpermdi vs35, vs34, vs42, 0 xxpermdi vs34, vs34, vs42, 2 +#endif lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) .endif .if \IsLast==1 @@ -1634,10 +1898,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART2 vs34, vs42, vs4, vs5 MULT_APLHA_PART2 vs35, vs43, vs6, vs7 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 + xxperm vs4, vs5, save_permute_1 + xxperm vs6, vs7, save_permute_1 +#else xxperm vs0, vs1, vs28 xxperm vs2, vs3, vs28 xxperm vs4, vs5, vs28 xxperm vs6, vs7, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs2 @@ -1648,10 +1919,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvp vs26, 32(CO) #else /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxv vs2, 0(CO) + stxv vs0, 16(CO) + stxv vs6, 32(CO) + stxv vs4, 48(CO) +#else stxv vs0, 0(CO) stxv vs2, 16(CO) stxv vs4, 32(CO) stxv vs6, 48(CO) +#endif #endif addi CO, CO, 64 .endm @@ -1701,8 +1979,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxv vs34, (\OffsetB)(BO) lxvp vs32, (0+\OffsetA)(AO) vspltisb v6, 0 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs38, 2 + xxpermdi vs34, vs34, vs38, 0 +#else xxpermdi vs35, vs34, vs38, 0 xxpermdi vs34, vs34, vs38, 2 +#endif lxvp vs36, (32+\OffsetA)(AO) .endm @@ -1729,8 +2012,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 1, 35, 36 .if \Complete==0 lxv vs34, DISP2(\Index, \OffsetB)(\BREG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs38, 2 + xxpermdi vs34, vs34, vs38, 0 +#else xxpermdi vs35, vs34, vs38, 0 xxpermdi vs34, vs34, vs38, 2 +#endif lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) .endif .if \IsLast==1 @@ -1775,8 +2063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART2 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs33, vs41, vs2, vs3 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 +#else xxperm vs0, vs1, vs28 xxperm vs2, vs3, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs2 @@ -1784,8 +2077,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvp vs24, 0(CO) #else /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxv vs2, 0(CO) + stxv vs0, 16(CO) +#else stxv vs0, 0(CO) stxv vs2, 16(CO) +#endif #endif addi CO, CO, 32 .endm @@ -1904,7 +2202,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART1 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs32, vs40, vs0, vs1 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs0, vs1, save_permute_1 +#else xxperm vs0, vs1, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs0 @@ -2018,7 +2320,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
MULT_APLHA_PART1 vs32, vs40, vs37, vs1 MULT_APLHA_PART2 vs32, vs40, vs37, vs1 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs37, vs1, save_permute_1 +#else xxperm vs37, vs1, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs36, vs36, vs37 diff --git a/kernel/power/copy_microk_power10.c b/kernel/power/copy_microk_power10.c index c90dc3785..8bca1a1e7 100644 --- a/kernel/power/copy_microk_power10.c +++ b/kernel/power/copy_microk_power10.c @@ -62,38 +62,39 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "one%=: \n\t" "stxvp 32, 0(%3) \n\t" - "lxvp 32, 0(%2) \n\t" "stxvp 34, 32(%3) \n\t" - "lxvp 34, 32(%2) \n\t" "stxvp 36, 64(%3) \n\t" - "lxvp 36, 64(%2) \n\t" "stxvp 38, 96(%3) \n\t" + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" "lxvp 38, 96(%2) \n\t" "stxvp 40, 128(%3) \n\t" - "lxvp 40, 128(%2) \n\t" "stxvp 42, 160(%3) \n\t" - "lxvp 42, 160(%2) \n\t" "stxvp 44, 192(%3) \n\t" - "lxvp 44, 192(%2) \n\t" "stxvp 46, 224(%3) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" "lxvp 46, 224(%2) \n\t" "stxvp 48, 256(%3) \n\t" - "lxvp 48, 256(%2) \n\t" "stxvp 50, 288(%3) \n\t" - "lxvp 50, 288(%2) \n\t" "stxvp 52, 320(%3) \n\t" - "lxvp 52, 320(%2) \n\t" "stxvp 54, 352(%3) \n\t" + "lxvp 48, 256(%2) \n\t" + "lxvp 50, 288(%2) \n\t" + "lxvp 52, 320(%2) \n\t" "lxvp 54, 352(%2) \n\t" + "stxvp 56, 384(%3) \n\t" - "lxvp 56, 384(%2) \n\t" "stxvp 58, 416(%3) \n\t" - "lxvp 58, 416(%2) \n\t" "stxvp 60, 448(%3) \n\t" - "lxvp 60, 448(%2) \n\t" "stxvp 62, 480(%3) \n\t" + "lxvp 56, 384(%2) \n\t" + "lxvp 58, 416(%2) \n\t" + "lxvp 60, 448(%2) \n\t" "lxvp 62, 480(%2) \n\t" "addi %3, %3, 512 \n\t" diff --git a/kernel/power/cscal_microk_power10.c b/kernel/power/cscal_microk_power10.c new file mode 100644 index 000000000..d6a91f079 --- /dev/null +++ b/kernel/power/cscal_microk_power10.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) +{ + __vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i}; +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + __vector unsigned char mask = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; +#else + __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; +#endif + __asm__ + ( + "dcbt 0, %2 \n\t" + "xscvdpspn 32, %x3 \n\t" + "xxspltw 32, 32, 0 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "addic. %1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmulsp 49, 41, 32 \n\t" + "xvmulsp 50, 42, 32 \n\t" + "xvmulsp 51, 43, 32 \n\t" + "xvmulsp 52, 44, 32 \n\t" + "xvmulsp 53, 45, 32 \n\t" + "xvmulsp 54, 46, 32 \n\t" + "xvmulsp 55, 47, 32 \n\t" + + "xxperm 34, 40, %x5 \n\t" + "xxperm 35, 41, %x5 \n\t" + "xxperm 36, 42, %x5 \n\t" + "xxperm 37, 43, %x5 \n\t" + "xxperm 38, 44, %x5 \n\t" + "xxperm 39, 45, %x5 \n\t" + "xxperm 56, 46, %x5 \n\t" + "xxperm 57, 47, %x5 \n\t" + + "xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmulsp 35, 35, %x4 \n\t" + + "lxvp 40, 128(%2) \n\t" + + "xvmulsp 36, 36, %x4 \n\t" + "xvmulsp 37, 37, %x4 \n\t" + + "lxvp 42, 160(%2) \n\t" + + "xvmulsp 38, 38, %x4 \n\t" + "xvmulsp 39, 39, %x4 \n\t" + + "lxvp 44, 192(%2) \n\t" + + "xvmulsp 56, 56, %x4 \n\t" + "xvmulsp 57, 57, %x4 \n\t" + + "lxvp 46, 224(%2) \n\t" + + "xvaddsp 48, 48, 34 \n\t" + "xvaddsp 49, 49, 35 \n\t" + "xvaddsp 50, 50, 36 \n\t" + "xvaddsp 51, 51, 37 \n\t" + + "stxvp 48, 0(%2) \n\t" + + "xvaddsp 52, 52, 38 \n\t" + "xvaddsp 53, 53, 39 \n\t" + + "stxvp 50, 32(%2) \n\t" + + "xvaddsp 54, 54, 56 \n\t" + "xvaddsp 55, 55, 57 \n\t" + + "stxvp 52, 64(%2) \n\t" + "stxvp 54, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmulsp 49, 41, 32 \n\t" + "xvmulsp 50, 42, 32 \n\t" + "xvmulsp 51, 43, 32 \n\t" + "xvmulsp 52, 44, 32 \n\t" + "xvmulsp 53, 45, 32 \n\t" + "xvmulsp 54, 46, 32 \n\t" + "xvmulsp 55, 47, 32 \n\t" + + "xxperm 34, 40, %x5 \n\t" + "xxperm 35, 41, %x5 \n\t" + "xxperm 36, 42, %x5 \n\t" + "xxperm 37, 43, %x5 \n\t" + "xxperm 38, 44, %x5 \n\t" + "xxperm 39, 45, %x5 \n\t" + "xxperm 56, 46, %x5 \n\t" + "xxperm 57, 47, %x5 \n\t" + + + "xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmulsp 35, 35, %x4 \n\t" + "xvmulsp 36, 36, %x4 \n\t" + "xvmulsp 37, 37, %x4 \n\t" + "xvmulsp 38, 38, %x4 \n\t" + "xvmulsp 39, 39, %x4 \n\t" + "xvmulsp 56, 56, %x4 \n\t" + "xvmulsp 57, 57, %x4 \n\t" + + "xvaddsp 48, 48, 34 \n\t" + "xvaddsp 49, 49, 35 \n\t" + "xvaddsp 50, 50, 36 \n\t" + "xvaddsp 51, 51, 37 \n\t" + + "stxvp 48, 0(%2) \n\t" + + "xvaddsp 52, 52, 38 \n\t" + "xvaddsp 53, 53, 39 \n\t" + + "stxvp 50, 32(%2) \n\t" + + "xvaddsp 54, 54, 56 \n\t" + "xvaddsp 55, 55, 57 \n\t" + + "stxvp 52, 64(%2) \n\t" + "stxvp 54, 96(%2) \n\t" + + "#n=%1 x=%0=%2 alpha=(%3,%4)\n" + : + "+m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + "f" (alpha_r), // 3 + "wa" (t0), // 4 + "wa" (mask) // 5 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57" + ); +} diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index 5144a2e93..4d9b9ccd6 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "cswap_microk_power8.c" +#elif defined(POWER10) +#include "cswap_microk_power10.c" #endif #endif diff --git a/kernel/power/cswap_microk_power10.c b/kernel/power/cswap_microk_power10.c new file mode 100644 index 000000000..2a44a9e30 --- /dev/null +++ b/kernel/power/cswap_microk_power10.c @@ -0,0 +1,127 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#if defined(DOUBLE) +#define HAVE_KERNEL_16 1 +static void zswap_kernel_16 (long n, double *x, double *y) +#else +#define HAVE_KERNEL_32 1 +static void cswap_kernel_32 (long n, float *x, float *y) +#endif +{ + __asm__ + ( + ".align 5 \n" + "one%=: \n\t" + "lxvp 32, 0(%4) \n\t" + "lxvp 34, 32(%4) \n\t" + "lxvp 36, 64(%4) \n\t" + "lxvp 38, 96(%4) \n\t" + + "lxvp 40, 128(%4) \n\t" + "lxvp 42, 160(%4) \n\t" + "lxvp 44, 192(%4) \n\t" + "lxvp 46, 224(%4) \n\t" + + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "lxvp 56, 128(%3) \n\t" + "lxvp 58, 160(%3) \n\t" + "lxvp 60, 192(%3) \n\t" + "lxvp 62, 224(%3) \n\t" + + + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" + + "addi %3, %3, 128 \n\t" + + "stxv 41, 0(%3) \n\t" + "stxv 40, 16(%3) \n\t" + "stxv 43, 32(%3) \n\t" + "stxv 42, 48(%3) \n\t" + "stxv 45, 64(%3) \n\t" + "stxv 44, 80(%3) \n\t" + "stxv 47, 96(%3) \n\t" + "stxv 46, 112(%3) \n\t" + + "addi %3, %3, 128 \n\t" + + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 53, 64(%4) \n\t" + "stxv 52, 80(%4) \n\t" + "stxv 55, 96(%4) \n\t" + "stxv 54, 112(%4) \n\t" + + "addi %4, %4, 128 \n\t" + + "stxv 57, 0(%4) \n\t" + "stxv 56, 16(%4) \n\t" + "stxv 59, 32(%4) \n\t" + "stxv 58, 48(%4) \n\t" + "stxv 61, 64(%4) \n\t" + "stxv 60, 80(%4) \n\t" + "stxv 63, 96(%4) \n\t" + "stxv 62, 112(%4) \n\t" + + "addi %4, %4, 128 \n\t" + +#if defined(DOUBLE) + "addic. %2, %2, -16 \n\t" +#else + "addic. %2, %2, -32 \n\t" +#endif + "bgt one%= \n" + + "#n=%2 x=%0=%3 y=%1=%4" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 999dc677a..9ed0af767 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -46,13 +46,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "dasum_microk_power8.c" +#elif defined(POWER10) +#include "dasum_microk_power10.c" #endif #endif - #ifndef HAVE_KERNEL_16 static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) @@ -110,6 +111,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { +#if defined(POWER10) + if ( n >= 32) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + sumf += ABS(x[i]); + } + } + n1 = (n-i) & -32; + if ( n1 > 0 ) + { + sumf += dasum_kernel_16(n1, &x[i]); + i+=n1; + } +#else n1 = n & -16; if ( n1 > 0 ) { @@ -117,6 +133,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sumf = dasum_kernel_16(n1, x); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/dasum_microk_power10.c b/kernel/power/dasum_microk_power10.c new file mode 100644 index 000000000..110627fa4 --- /dev/null +++ b/kernel/power/dasum_microk_power10.c @@ -0,0 +1,240 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static double dasum_kernel_16 (long n, double *x) +{ + double sum; + __vector double t0; + __vector double t1; + __vector double t2; + __vector double t3; + __vector double t4; + __vector double t5; + __vector double t6; + __vector double t7; + __vector double a0; + __vector double a1; + __vector double a2; + __vector double a3; + __vector double a4; + __vector double a5; + __vector double a6; + __vector double a7; + + + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "xxlxor %x11, %x11, %x11 \n\t" + "xxlxor %x12, %x12, %x12 \n\t" + "xxlxor %x13, %x13, %x13 \n\t" + "xxlxor %x14, %x14, %x14 \n\t" + "xxlxor %x15, %x15, %x15 \n\t" + "xxlxor %x16, %x16, %x16 \n\t" + "xxlxor %x17, %x17, %x17 \n\t" + "xxlxor %x18, %x18, %x18 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 52, 128(%2) \n\t" + "lxvp 54, 160(%2) \n\t" + "lxvp 56, 192(%2) \n\t" + "lxvp 58, 224(%2) \n\t" + + "addi %2, %2, 256 \n\t" + + "addic. %1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + + "xvabsdp %x3, 44 \n\t" + "xvabsdp %x4, 45 \n\t" + "xvabsdp %x5, 46 \n\t" + "xvabsdp %x6, 47 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "xvadddp 36, 36, %x3 \n\t" + "xvadddp 37, 37, %x4 \n\t" + "xvadddp 38, 38, %x5 \n\t" + "xvadddp 39, 39, %x6 \n\t" + + "xvabsdp 60, 52 \n\t" + "xvabsdp 61, 53 \n\t" + "xvabsdp 62, 54 \n\t" + "xvabsdp 63, 55 \n\t" + + "xvabsdp %x7, 56 \n\t" + "xvabsdp %x8, 57 \n\t" + "xvabsdp %x9, 58 \n\t" + "xvabsdp %x10, 59 \n\t" + + "xvadddp %x11, %x11, 60 \n\t" + "xvadddp %x12, %x12, 61 \n\t" + "xvadddp %x13, %x13, 62 \n\t" + "xvadddp %x14, %x14, 63 \n\t" + + "lxvp 52, 128(%2) \n\t" + "lxvp 54, 160(%2) \n\t" + "lxvp 56, 192(%2) \n\t" + "lxvp 58, 224(%2) \n\t" + "xvadddp %x15, %x15, %x7 \n\t" + "xvadddp %x16, %x16, %x8 \n\t" + "xvadddp %x17, %x17, %x9 \n\t" + "xvadddp %x18, %x18, %x10 \n\t" + "addi %2, %2, 256 \n\t" + "addic. 
%1, %1, -32 \n\t" + + "bgt one%= \n" + + "two%=: \n\t" + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + "xvabsdp %x3, 44 \n\t" + "xvabsdp %x4, 45 \n\t" + "xvabsdp %x5, 46 \n\t" + "xvabsdp %x6, 47 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "xvadddp 36, 36, %x3 \n\t" + "xvadddp 37, 37, %x4 \n\t" + "xvadddp 38, 38, %x5 \n\t" + "xvadddp 39, 39, %x6 \n\t" + + "xvabsdp 60, 52 \n\t" + "xvabsdp 61, 53 \n\t" + "xvabsdp 62, 54 \n\t" + "xvabsdp 63, 55 \n\t" + + "xvabsdp %x7, 56 \n\t" + "xvabsdp %x8, 57 \n\t" + "xvabsdp %x9, 58 \n\t" + "xvabsdp %x10, 59 \n\t" + "xvadddp %x11, %x11, 60 \n\t" + "xvadddp %x12, %x12, 61 \n\t" + "xvadddp %x13, %x13, 62 \n\t" + "xvadddp %x14, %x14, 63 \n\t" + + "xvadddp %x15, %x15, %x7 \n\t" + "xvadddp %x16, %x16, %x8 \n\t" + "xvadddp %x17, %x17, %x9 \n\t" + "xvadddp %x18, %x18, %x10 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp %x11, %x11, %x12 \n\t" + "xvadddp %x13, %x13, %x14 \n\t" + "xvadddp %x15, %x15, %x16 \n\t" + "xvadddp %x17, %x17, %x18 \n\t" + + "xvadddp %x11, %x11, %x13 \n\t" + "xvadddp %x15, %x15, %x17 \n\t" + + "xvadddp %x11, %x11, %x15 \n\t" + + "xvadddp 32, 32, 36 \n\t" + "xvadddp 32, 32, %x11 \n\t" + + XXSWAPD_S(33,32) + "xsadddp %x0, 32, 33 \n" + + "#n=%1 x=%3=%2 sum=%0\n" + "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" + : + "=d" (sum), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "=wa" (t0), // 3 + "=wa" (t1), // 4 + "=wa" (t2), // 5 + "=wa" (t3), // 6 + "=wa" (t4), // 7 + "=wa" (t5), // 8 + "=wa" (t6), // 9 + "=wa" (t7), // 10 + "=wa" (a0), // 11 + "=wa" (a1), // 12 + "=wa" (a2), // 13 + "=wa" (a3), // 14 + "=wa" (a4), // 15 + "=wa" (a5), // 16 + "=wa" (a6), // 17 + "=wa" (a7) // 18 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); + + return sum; +} + + diff --git a/kernel/power/daxpy_power10.c b/kernel/power/daxpy_power10.c index ebe91a80f..8640efcfd 100644 --- a/kernel/power/daxpy_power10.c +++ b/kernel/power/daxpy_power10.c @@ -66,12 +66,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -16; + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 ) + daxpy_kernel_8(n1, &x[i], &y[i], da); + + i += n1; - if ( n1 ) - daxpy_kernel_8(n1, x, y, da); - - i = n1; while(i < n) { diff --git a/kernel/power/dcopy_power10.c b/kernel/power/dcopy_power10.c index cd10b7136..6c5eb4d77 100644 --- a/kernel/power/dcopy_power10.c +++ b/kernel/power/dcopy_power10.c @@ -85,12 +85,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1 )) { - - BLASLONG n1 = n & -64; - if ( n1 > 0 ) + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + y[i] = x[i] ; + } + } + BLASLONG n1 = (n-i) & -64; + if ( n1 ) { - copy_kernel(n1, x, y); - i=n1; + copy_kernel(n1, &x[i], &y[i]); + i += n1; } while(i < n) diff --git a/kernel/power/dgemm_kernel_power10.c 
b/kernel/power/dgemm_kernel_power10.c index b531799a6..cdd846891 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -29,7 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); -typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !__has_builtin(__builtin_vsx_disassemble_pair) +#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair +#endif #ifdef TRMMKERNEL #define SAVE_ACC(ACC, J) \ @@ -184,10 +190,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7; BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; - vec_t *rb = (vec_t *) & BO[0]; __vector_pair rowB, rowB1; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[0])); + rowB1 = *((__vector_pair *)((void *)&BO[4])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); @@ -199,9 +204,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 3]; - rb = (vec_t *) & BO[l << 3]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -241,9 +245,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[0])); + rowB1 = *((__vector_pair *)((void *)&BO[4])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); @@ -251,9 +254,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 2]; - rb = (vec_t *) & BO[l << 3]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -285,17 +287,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[0])); + rowB1 = *((__vector_pair *)((void *)&BO[4])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 1]; - rb = (vec_t *) 
& BO[l << 3]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); } @@ -397,8 +397,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[0])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); @@ -406,8 +405,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 3]; - rb = (vec_t *) & BO[l << 2]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); @@ -439,15 +437,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[0])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 2]; - rb = (vec_t *) & BO[l << 2]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); } @@ -475,14 +471,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[0])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 1]; - rb = (vec_t *) & BO[l << 2]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } SAVE_ACC (&acc0, 0); @@ -562,11 +556,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); vec_t *rowA = (vec_t *) & AO[0]; __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); @@ -574,9 +566,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); for (l = 1; l < temp; l++) { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); rowA = (vec_t *) & AO[l << 3]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); 
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -607,19 +598,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t result[4]; __vector_quad acc0, acc1; BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); vec_t *rowA = (vec_t *) & AO[0]; __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); for (l = 1; l < temp; l++) { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); rowA = (vec_t *) & AO[l << 2]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -646,18 +634,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t result[4]; __vector_quad acc0; BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); vec_t *rowA = (vec_t *) & AO[0]; __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); for (l = 1; l < temp; l++) { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); rowA = (vec_t *) & AO[l << 1]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } diff --git a/kernel/power/dgemm_small_kernel_nn_power10.c b/kernel/power/dgemm_small_kernel_nn_power10.c new file mode 100644 index 000000000..ecdc3e5c6 --- /dev/null +++ b/kernel/power/dgemm_small_kernel_nn_power10.c @@ -0,0 +1,923 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <altivec.h> + +typedef __vector unsigned char vec_t; + +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !defined(B0) +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_2x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#else + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); 
\ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_2x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#if (defined(__GNUC__) && (__GNUC__ == 10)) +#if defined(_AIX) +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); +#else +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0); +#endif +#else +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1); +#endif + +#define LOAD_A_1x8(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); \ + ra1 = vec_xl(0, A+((K)*lda)+M+2); \ + ra2 = vec_xl(0, A+((K)*lda)+M+4); \ + ra3 = vec_xl(0, A+((K)*lda)+M+6); + +#define LOAD_A_1x4(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); \ + ra1 = vec_xl(0, A+((K)*lda)+M+2); \ + +#define LOAD_A_1x2(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); + +#define LOAD_A_1x1(K, M) \ + ra0 = vec_splats(A[((K)*lda)+M+0]); + +#define LOAD_BTP_8x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + LOAD_PAIR(pb0, t0, t1); \ + t0 = vec_mergel(rb0, rb1); \ + t1 = vec_mergel(rb2, rb3); \ + LOAD_PAIR(pb2, t0, t1); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergeh(rb6, rb7); \ + LOAD_PAIR(pb1, t0, t1); \ + t0 = vec_mergel(rb4, rb5); \ + t1 = vec_mergel(rb6, rb7); \ + LOAD_PAIR(pb3, t0, t1); + +#define LOAD_BTP_8x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ 
+ rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb0, rb0, rb1); \ + rb2 = vec_xor(rb2, rb2); \ + rb2 = vec_insert(B[(N+4)*ldb+K], rb2, 0); \ + rb2 = vec_insert(B[(N+5)*ldb+K], rb2, 1); \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(B[(N+6)*ldb+K], rb3, 0); \ + rb3 = vec_insert(B[(N+7)*ldb+K], rb3, 1); \ + LOAD_PAIR(pb1, rb2, rb3); + +#define LOAD_BTP_4x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + LOAD_PAIR(pb0, t0, t1); \ + t0 = vec_mergel(rb0, rb1); \ + t1 = vec_mergel(rb2, rb3); \ + LOAD_PAIR(pb1, t0, t1); + +#define LOAD_BTP_4x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb0, rb0, rb1); + +#define LOAD_BTP_2x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \ + t1 = vec_mergel(rb0, rb1); \ + __builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1); + +#define LOAD_BTP_2x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0); + +#define LOAD_B_1x1(N, K) \ + rb0 = vec_splats(B[((N)*ldb)+K]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ + __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ + __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ + __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ + __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_B(pb0, pb1, offset) \ + *((__vector_pair *)(void *)(packB+(k*8)+0+offset)) = pb0; \ + *((__vector_pair *)(void *)(packB+(k*8)+4+offset)) = pb1; + +#define LOAD_PACKED_B(pb0, pb1, offset) \ + pb0 = *((__vector_pair *)((void *)(packB+(k*8)+0+offset))); \ + pb1 = *((__vector_pair *)((void *)(packB+(k*8)+4+offset))); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, 
FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k2 = K & ~1; + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 1 : 0; +#else + int has_packing = 0; +#endif + + double *packB; + if (has_packing) packB = (double *)malloc(K*8*sizeof(double)); + + vector double valpha = vec_splats(alpha); +#if !defined(B0) + vector double vbeta = vec_splats(beta); +#endif + + for (n = 0; n < n8; n += 8) { + for (m = 0; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + if (has_packing) { + if (m == 0) { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_B(pb0, pb1, 0); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_B(pb2, pb3, 8); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_B(pb0, pb1, 0); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x8(k+1, m); + LOAD_PACKED_B(pb2, pb3, 8); + KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc4, n+0, m+4); + SAVE_4x2_ACC(&acc6, n+0, m+6); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + SAVE_4x2_ACC(&acc5, n+4, m+4); + SAVE_4x2_ACC(&acc7, n+4, m+6); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1); + 
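          /*
           * Illustrative sketch of what each KERNEL_MMA_4ACC call in this
           * loop accumulates, assuming GCC's documented semantics for
           * __builtin_mma_xvf64gerpp: one rank-1 update of a 4 (N) x 2 (M)
           * double tile per accumulator.  pb0/pb1 hold B(k, n..n+3) and
           * B(k, n+4..n+7) gathered by LOAD_BTP_8x2, pb2/pb3 the same values
           * for k+1, and ra0/ra1 hold A(m..m+1, k) and A(m+2..m+3, k).
           * In scalar terms, for i in 0..3 and j in 0..1:
           *
           *   acc0[i][j] += B[(n+i)*ldb + k]   * A[k*lda + m + j];     // pb0 x ra0
           *   acc1[i][j] += B[(n+4+i)*ldb + k] * A[k*lda + m + j];     // pb1 x ra0
           *   acc2[i][j] += B[(n+i)*ldb + k]   * A[k*lda + m + 2 + j]; // pb0 x ra1
           *   acc3[i][j] += B[(n+4+i)*ldb + k] * A[k*lda + m + 2 + j]; // pb1 x ra1
           */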
} + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + LOAD_A_1x4(k+1, m); + LOAD_PACKED_B(pb2, pb3, 8); + KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + } + + for (; m < m2; m += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x2(k+1, m); + LOAD_PACKED_B(pb2, pb3, 8); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; m < M; m++) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x1(k+1, m); + LOAD_PACKED_B(pb2, pb3, 8); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x1_ACC(&acc0, n+0, m+0); + SAVE_4x1_ACC(&acc1, n+4, m+0); + } + } + + for (; n < n4; n += 4) { + for (m = 0; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; 
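      /*
       * rc0 is only needed on this (beta) path: when B0 is not defined, the
       * SAVE_* macros below reload each 2-double slice of C, scale it by
       * vbeta, and fuse the alpha multiply via vec_madd.  A rough scalar
       * sketch for one accumulated 4x2 tile (acc0[i][j] is illustrative
       * shorthand, not a real lvalue):
       *
       *   for (int i = 0; i < 4; i++)
       *     for (int j = 0; j < 2; j++)
       *       C[(n+i)*ldc + m + j] = beta * C[(n+i)*ldc + m + j]
       *                              + alpha * acc0[i][j];
       *
       * With B0 defined, C is simply overwritten with alpha * acc instead.
       */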
+#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc3, n+0, m+6); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + } + + for (; m < m2; m += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; m < M; m++) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x1_ACC(&acc0, n, m); + } + } + + for (; n < n2; n += 2) { + for (m = 0; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + SAVE_2x2_ACC(&acc2, n+0, m+4); + SAVE_2x2_ACC(&acc3, n+0, m+6); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + } + + for (; m < m2; m += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + 
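      /*
       * This 2x2 micro-tile still uses a full 4 (N) x 2 (M) accumulator,
       * zeroed by INIT_1ACC() above.  LOAD_BTP_2x2/LOAD_BTP_2x1 duplicate the
       * two B values into both halves of the __vector_pair, so rows 2..3 of
       * the accumulator merely repeat rows 0..1, and SAVE_2x2_ACC stores only
       * result[0] and result[1].  Scalar sketch of what the k loop computes:
       *
       *   for (int i = 0; i < 2; i++)
       *     for (int j = 0; j < 2; j++)
       *       for (BLASLONG kk = 0; kk < K; kk++)
       *         acc0[i][j] += B[(n+i)*ldb + kk] * A[kk*lda + m + j];
       */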
register vector double ra0; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + } + + for (; m < M; m++) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x1_ACC(&acc0, n+0, m+0); + } + } + + for (; n < N; n++) { + for (m = 0; m < m8; m += 8) { + vector double result = ((vector double){0.,0.}); + vector double result1 = ((vector double){0.,0.}); + vector double result2 = ((vector double){0.,0.}); + vector double result3 = ((vector double){0.,0.}); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + SAVE_1x4_VSR(result2, n, m+4); + SAVE_1x4_VSR(result3, n, m+6); + } + + for (; m < m4; m += 4) { + vector double result = ((vector double){0.,0.}); + vector double result1 = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + } + + for (; m < m2; m += 2) { + vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + } + + for (; m < M; m++) { + FLOAT result = 0.0; + + for (k = 0; k < K; k++) { + result += A[m+k*lda] * B[n*ldb+k]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free(packB); + + return 0; +} diff --git a/kernel/power/dgemm_small_kernel_nt_power10.c b/kernel/power/dgemm_small_kernel_nt_power10.c new file mode 100644 index 000000000..7cc8c9f6c --- /dev/null +++ b/kernel/power/dgemm_small_kernel_nt_power10.c @@ -0,0 +1,581 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !defined(B0) +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], 
valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_A_1x8(K, M) \ + ra0 = vec_xl(0, A+(K*lda)+M+0); \ + ra1 = vec_xl(0, A+(K*lda)+M+2); \ + ra2 = vec_xl(0, A+(K*lda)+M+4); \ + ra3 = vec_xl(0, A+(K*lda)+M+6); + +#define LOAD_A_1x4(K, M) \ + ra0 = vec_xl(0, A+(K*lda)+M+0); \ + ra1 = vec_xl(0, A+(K*lda)+M+2); + +#define LOAD_A_1x2(K, M) ra0 = vec_xl(0, A+(K*lda)+M); + +#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M]); + +#define LOAD_BP_1x8(K, N) \ + pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \ + pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4])); + +#define LOAD_BP_1x4(K, N) \ + pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); + +#define LOAD_BP_1x2(K, N) \ + t0 = vec_xl(0, B+(K*ldb)+N); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); + +#define LOAD_B_1x8(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+2); \ + rb2 = vec_xl(0, B+(K*ldb)+N+4); \ + rb3 = vec_xl(0, B+(K*ldb)+N+6); \ + +#define LOAD_B_1x4(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+2); + +#define LOAD_B_1x2(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); + +#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ + __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ + __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ + __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ + __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define 
KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + vector double valpha = vec_splats(alpha); +#if !defined(B0) + vector double vbeta = vec_splats(beta); +#endif + + for (m = 0; m < m8; m += 8) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector double ra0, ra1, ra2, ra3; + __vector_pair pb0, pb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc4, n+0, m+4); + SAVE_4x2_ACC(&acc6, n+0, m+6); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + SAVE_4x2_ACC(&acc5, n+4, m+4); + SAVE_4x2_ACC(&acc7, n+4, m+6); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc3, n+0, m+6); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double t0; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + SAVE_2x2_ACC(&acc2, n+0, m+4); + SAVE_2x2_ACC(&acc3, n+0, m+6); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + SAVE_1x4_VSR(result2, n, m+4); + SAVE_1x4_VSR(result3, n, m+6); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1; + __vector_pair pb0, pb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BP_1x8(k, n); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + 
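      /*
       * Each SAVE_4x2_ACC below first spills its accumulator into the
       * result[] scratch array with __builtin_mma_disassemble_acc (four
       * 2-double vectors, one per N column of the 4x2 tile), then scales by
       * valpha and, when B0 is not defined, blends in vbeta * C via vec_madd
       * before storing with vec_xst.  Sketch of the B0-defined path for one
       * accumulator (assuming the documented disassemble layout):
       *
       *   for (int i = 0; i < 4; i++)
       *     for (int j = 0; j < 2; j++)
       *       C[(n+i)*ldc + m + j] = alpha * acc[i][j];
       */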
SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BP_1x4(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double t0; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BP_1x2(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0; + __vector_pair pb0, pb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BP_1x8(k, n); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BP_1x4(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double t0; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BP_1x2(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + } + } + + for (; m < M; m++) { + for (n = 0; n < n8; n += 8) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x8(k, n); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+2, m); + SAVE_4x1_VSR(result2, n+4, m); + 
SAVE_4x1_VSR(result3, n+6, m); + } + + for (; n < n4; n += 4) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x4(k, n); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+2, m); + } + + for (; n < n2; n += 2) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0; + + for (k = 0; k < K; k++) { + result += A[k*lda+m] * B[k*ldb+n]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + return 0; +} diff --git a/kernel/power/dgemm_small_kernel_tn_power10.c b/kernel/power/dgemm_small_kernel_tn_power10.c new file mode 100644 index 000000000..93a942b02 --- /dev/null +++ b/kernel/power/dgemm_small_kernel_tn_power10.c @@ -0,0 +1,882 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !defined(B0) +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_2x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#else + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], 
valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_2x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#if (defined(__GNUC__) && (__GNUC__ == 10)) +#if defined(_AIX) +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); +#else +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0); +#endif +#else +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1); +#endif + +#define LOAD_AT_8x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; \ + ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ + ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ + t0 = vec_mergeh(ra2, ra3); \ + t1 = vec_mergel(ra2, ra3); \ + ra2 = t0; \ + ra3 = t1; \ + ra4 = vec_xl(0, A+(M+4)*lda+K+0); \ + ra5 = vec_xl(0, A+(M+5)*lda+K+0); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra4 = t0; \ + ra5 = t1; \ + ra6 = vec_xl(0, A+(M+6)*lda+K+0); \ + ra7 = vec_xl(0, A+(M+7)*lda+K+0); \ + t0 = vec_mergeh(ra6, ra7); \ + t1 = vec_mergel(ra6, ra7); \ + ra6 = t0; \ + ra7 = t1; + +#define LOAD_AT_8x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ + ra2 = vec_xor(ra2, ra2); \ + ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \ + ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \ + ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1); \ + +#define LOAD_AT_4x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ + ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeh(ra2, ra3); \ + t2 = vec_mergel(ra0, ra1); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = t0; \ + ra1 = t2; \ + ra2 = t1; \ + ra3 = t3; + +#define LOAD_AT_4x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra1 = vec_xor(ra1, 
ra1); \ + ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ + +#define LOAD_AT_2x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; + +#define LOAD_AT_2x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_1x1(K, M) \ + ra0 = vec_splats(A[((M+0)*lda)+K+0]); + +#define LOAD_BTP_8x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + LOAD_PAIR(pb0, t0, t1); \ + t0 = vec_mergel(rb0, rb1); \ + t1 = vec_mergel(rb2, rb3); \ + LOAD_PAIR(pb2, t0, t1); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergeh(rb6, rb7); \ + LOAD_PAIR(pb1, t0, t1); \ + t0 = vec_mergel(rb4, rb5); \ + t1 = vec_mergel(rb6, rb7); \ + LOAD_PAIR(pb3, t0, t1); + +#define LOAD_BTP_8x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb0, rb0, rb1); \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+4)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+5)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb1, rb0, rb1); + +#define LOAD_BTP_4x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + LOAD_PAIR(pb0, t0, t1); \ + t0 = vec_mergel(rb0, rb1); \ + t1 = vec_mergel(rb2, rb3); \ + LOAD_PAIR(pb1, t0, t1); + +#define LOAD_BTP_4x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb0, rb0, rb1); + +#define LOAD_BTP_2x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \ + t1 = vec_mergel(rb0, rb1); \ + __builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1); + +#define LOAD_BTP_2x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0); + +#define LOAD_B_1x1(N, K) rb0 = vec_splats(B[((N)*ldb)+K]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ + __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ + __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ + __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ + __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, 
b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); + +#define KERNEL_MMA_1ACC_(acc, b0, a0) \ + __builtin_mma_xvf64gerpp(&acc, b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k2 = K & ~1; + + vector double valpha = vec_splats(alpha); +#if !defined(B0) + vector double vbeta = vec_splats(beta); +#endif + + for (m = 0; m < m8; m += 8) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_8ACC(pb0, pb0, pb0, pb0, pb1, pb1, pb1, pb1, + ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); + KERNEL_MMA_8ACC(pb2, pb2, pb2, pb2, pb3, pb3, pb3, pb3, + ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); + } + // workaround to avoid register spilling + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC_(acc0, pb0, ra0); + KERNEL_MMA_1ACC_(acc1, pb0, ra1); + LOAD_AT_4x1(m+4, k); + KERNEL_MMA_1ACC_(acc2, pb0, ra0); + KERNEL_MMA_1ACC_(acc3, pb0, ra1); + LOAD_AT_4x1(m, k); + LOAD_BTP_4x1(n+4, k); + KERNEL_MMA_1ACC_(acc4, pb0, ra0); + KERNEL_MMA_1ACC_(acc5, pb0, ra1); + LOAD_AT_4x1(m+4, k); + KERNEL_MMA_1ACC_(acc6, pb0, ra0); + KERNEL_MMA_1ACC_(acc7, pb0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc4, n+4, m+0); + SAVE_4x2_ACC(&acc6, n+4, m+4); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc3, n+0, m+6); + SAVE_4x2_ACC(&acc5, n+4, m+2); + SAVE_4x2_ACC(&acc7, n+4, m+6); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7); + } + for 
(; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc3, n+0, m+6); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc2, n+0, m+4); + SAVE_2x2_ACC(&acc1, n+0, m+2); + SAVE_2x2_ACC(&acc3, n+0, m+6); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + SAVE_1x4_VSR(result2, n, m+4); + SAVE_1x4_VSR(result3, n, m+6); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1, t2, t3; + + __vector_pair pb0, pb1, pb2, pb3; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra2, ra0, ra2); + KERNEL_MMA_4ACC(pb2, pb2, pb3, pb3, ra1, ra3, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra1, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc2, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1, t2, t3; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); + KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1; + register vector double t0, t1, t2, t3; + + __vector_pair 
pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); + KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + KERNEL_MMA_2ACC(pb2, pb3, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0, ra1; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + KERNEL_MMA_1ACC(pb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0, ra1; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + KERNEL_MMA_1ACC(pb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + } + } + + for (; m < M; m++) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + 
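          /*
           * K-remainder for the last (single) row of A: when K is odd, the
           * final column is handled here with LOAD_A_1x1, which splats
           * A[m*lda + k], and LOAD_BTP_8x1, which builds the two B pairs by
           * zeroing registers and vec_insert-ing B[(n+0..7)*ldb + k].
           * Scalar sketch of this last rank-1 update, for i in 0..3:
           *
           *   acc0[i] += B[(n+i)*ldb + k]   * A[m*lda + k];
           *   acc1[i] += B[(n+4+i)*ldb + k] * A[m*lda + k];
           *
           * Because ra0 is a splat, lane 1 of every accumulator row duplicates
           * lane 0; SAVE_4x1_ACC later stores only the first 8 bytes per row,
           * so the duplicate lane is discarded.
           */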
LOAD_A_1x1(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x1_ACC(&acc0, n+0, m+0); + SAVE_4x1_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x1_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x1_ACC(&acc0, n+0, m+0); + } + + for (; n < N; n++) { + FLOAT result = 0.0; + + for (k = 0; k < K; k++) { + result += A[m*lda+k] * B[n*ldb+k]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + return 0; +} diff --git a/kernel/power/dgemm_small_kernel_tt_power10.c b/kernel/power/dgemm_small_kernel_tt_power10.c new file mode 100644 index 000000000..b47b6201f --- /dev/null +++ b/kernel/power/dgemm_small_kernel_tt_power10.c @@ -0,0 +1,829 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#include <altivec.h>
+
+typedef __vector unsigned char vec_t;
+
+#if !__has_builtin(__builtin_vsx_assemble_pair)
+#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
+#endif
+
+#if !defined(B0)
+#define SAVE_4x2_ACC(ACC, N, M) \
+ __builtin_mma_disassemble_acc((void *)result, ACC); \
+ rc0 = vec_xl(0, C+(N+0)*ldc+M); \
+ rc0 = vec_mul(rc0, vbeta); \
+ result[0] = vec_madd(result[0], valpha, rc0); \
+ vec_xst(result[0], 0, C+(N+0)*ldc+M); \
+ rc0 = vec_xl(0, C+(N+1)*ldc+M); \
+ rc0 = vec_mul(rc0, vbeta); \
+ result[1] = vec_madd(result[1], valpha, rc0); \
+ vec_xst(result[1], 0, C+(N+1)*ldc+M); \
+ rc0 = vec_xl(0, C+(N+2)*ldc+M); \
+ rc0 = vec_mul(rc0, vbeta); \
+ result[2] = vec_madd(result[2], valpha, rc0); \
+ vec_xst(result[2], 0, C+(N+2)*ldc+M); \
+ rc0 = vec_xl(0, C+(N+3)*ldc+M); \
+ rc0 = vec_mul(rc0, vbeta); \
+ result[3] = vec_madd(result[3], valpha, rc0); \
+ vec_xst(result[3], 0, C+(N+3)*ldc+M);
+
+#define SAVE_2x2_ACC(ACC, N, M) \
+ __builtin_mma_disassemble_acc((void *)result, ACC); \
+ rc0 = vec_xl(0, C+(N+0)*ldc+M); \
+ rc0 = vec_mul(rc0, vbeta); \
+ result[0] = vec_madd(result[0], valpha, rc0); \
+ vec_xst(result[0], 0, C+(N+0)*ldc+M); \
+ rc0 = vec_xl(0, C+(N+1)*ldc+M); \
+ rc0 = vec_mul(rc0, vbeta); \
+ result[1] = vec_madd(result[1], valpha, rc0); \
+ vec_xst(result[1], 0, C+(N+1)*ldc+M);
+
+#define SAVE_1x4_VSR(result, N, M) \
+ rc0 = vec_xl(0, C+((N)*ldc)+M); \
+ rc0 = vec_mul(rc0, vbeta); \
+ result = vec_madd(result, valpha, rc0); \
+ vec_xst(result, 0, C+((N)*ldc)+M);
+
+#define SAVE_4x1_VSR(result, N, M) \
+ result = vec_mul(result, valpha); \
+ C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
+ C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1];
+
+#else
+
+#define SAVE_4x2_ACC(ACC, N, M) \
+ __builtin_mma_disassemble_acc((void *)result, ACC); \
+ result[0] = vec_mul(result[0], valpha); \
+ vec_xst(result[0], 0, C+(N+0)*ldc+M); \
+ result[1] = vec_mul(result[1], valpha); \
+ vec_xst(result[1], 0, C+(N+1)*ldc+M); \
+ result[2] = vec_mul(result[2], valpha); \
+ vec_xst(result[2], 0, C+(N+2)*ldc+M); \
+ result[3] = vec_mul(result[3], valpha); \
+ vec_xst(result[3], 0, C+(N+3)*ldc+M);
+
+#define SAVE_2x2_ACC(ACC, N, M) \
+ __builtin_mma_disassemble_acc((void *)result, ACC); \
+ result[0] = vec_mul(result[0], valpha); \
+ vec_xst(result[0], 0, C+(N+0)*ldc+M); \
+ result[1] = vec_mul(result[1], valpha); \
+ vec_xst(result[1], 0, C+(N+1)*ldc+M);
+
+#define SAVE_1x4_VSR(result, N, M) \
+ result = vec_mul(result, valpha); \
+ vec_xst(result, 0, C+((N)*ldc)+M);
+
+#define SAVE_4x1_VSR(result, N, M) \
+ result = vec_mul(result, valpha); \
+ C[(N+0)*ldc+M] = result[0]; \
+ C[(N+1)*ldc+M] = result[1];
+
+#endif
+
+#define INIT_8ACCS() \
+ __builtin_mma_xxsetaccz(&acc0); \
+ __builtin_mma_xxsetaccz(&acc1); \
+ __builtin_mma_xxsetaccz(&acc2); \
+ __builtin_mma_xxsetaccz(&acc3); \
+ __builtin_mma_xxsetaccz(&acc4); \
+ __builtin_mma_xxsetaccz(&acc5); \
+ __builtin_mma_xxsetaccz(&acc6); \
+ __builtin_mma_xxsetaccz(&acc7);
+
+#define INIT_4ACCS() \
+ __builtin_mma_xxsetaccz(&acc0); \
+ __builtin_mma_xxsetaccz(&acc1); \
+ __builtin_mma_xxsetaccz(&acc2); \
+ __builtin_mma_xxsetaccz(&acc3);
+
+#define INIT_2ACCS() \
+ __builtin_mma_xxsetaccz(&acc0); \
+ __builtin_mma_xxsetaccz(&acc1);
+
+#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
+
+#define LOAD_AT_8x2(M, K) \
+ ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
+ ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
+ ra2 = vec_xl(0,
A+(M+2)*lda+K+0); \ + ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeh(ra2, ra3); \ + t2 = vec_mergel(ra0, ra1); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = t0; \ + ra1 = t2; \ + ra2 = t1; \ + ra3 = t3; \ + ra4 = vec_xl(0, A+(M+4)*lda+K+0); \ + ra5 = vec_xl(0, A+(M+5)*lda+K+0); \ + ra6 = vec_xl(0, A+(M+6)*lda+K+0); \ + ra7 = vec_xl(0, A+(M+7)*lda+K+0); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergeh(ra6, ra7); \ + t2 = vec_mergel(ra4, ra5); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = t0; \ + ra5 = t2; \ + ra6 = t1; \ + ra7 = t3; + +#define LOAD_AT_8x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ + ra2 = vec_xor(ra2, ra2); \ + ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \ + ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \ + ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1); \ + +#define LOAD_AT_4x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ + ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeh(ra2, ra3); \ + t2 = vec_mergel(ra0, ra1); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = t0; \ + ra1 = t2; \ + ra2 = t1; \ + ra3 = t3; + +#define LOAD_AT_4x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ + +#define LOAD_AT_2x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; + +#define LOAD_AT_2x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]); + +#define LOAD_BP_1x8(K, N) \ + pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \ + pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4])); + +#define LOAD_BP_1x4(K, N) \ + pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); + +#define LOAD_BP_1x2(K, N) \ + t0 = vec_xl(0, B+((K)*ldb)+N); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); + +#define LOAD_B_1x8(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+2); \ + rb2 = vec_xl(0, B+(K*ldb)+N+4); \ + rb3 = vec_xl(0, B+(K*ldb)+N+6); \ + +#define LOAD_B_1x4(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+2); + +#define LOAD_B_1x2(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); + +#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[(K)*ldb+N]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ + __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ + __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ + __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ + __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); 
\ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_A(ra0, ra1, ra2, ra3, offset) \ + vec_xst(ra0, 0, packA+(k*8)+0+offset); \ + vec_xst(ra1, 0, packA+(k*8)+2+offset); \ + vec_xst(ra2, 0, packA+(k*8)+4+offset); \ + vec_xst(ra3, 0, packA+(k*8)+6+offset); + +#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ + ra0 = vec_xl(0, packA+(k*8)+0+offset); \ + ra1 = vec_xl(0, packA+(k*8)+2+offset); \ + ra2 = vec_xl(0, packA+(k*8)+4+offset); \ + ra3 = vec_xl(0, packA+(k*8)+6+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k2 = K & ~1; + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 
1 : 0; +#else + int has_packing = 0; +#endif + + double *packA; + if (has_packing) packA = (double *)malloc(K*8*sizeof(double)); + + vector double valpha = vec_splats(alpha); +#if !defined(B0) + vector double vbeta = vec_splats(beta); +#endif + + for (m = 0; m < m8; m += 8) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double t0, t1, t2, t3; + + __vector_pair pb0, pb1; + + if (has_packing) { + if (n == 0) { + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); + PACK_A(ra0, ra2, ra4, ra6, 0); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); + PACK_A(ra1, ra3, ra5, ra7, 8); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_A(ra0, ra1, ra2, ra3, 0); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc4, n+0, m+4); + SAVE_4x2_ACC(&acc6, n+0, m+6); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + SAVE_4x2_ACC(&acc5, n+4, m+4); + SAVE_4x2_ACC(&acc7, n+4, m+6); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double t0, t1, t2, t3; + + __vector_pair pb0; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + LOAD_BP_1x4(k+1, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); + LOAD_BP_1x4(k+1, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, 
ra2, ra3); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc3, n+0, m+6); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double t0, t1, t2, t3; + + __vector_pair pb0; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + LOAD_BP_1x2(k+1, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); + LOAD_BP_1x2(k+1, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + SAVE_2x2_ACC(&acc2, n+0, m+4); + SAVE_2x2_ACC(&acc3, n+0, m+6); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0; + + if (!has_packing) { + for (k = 0; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + SAVE_1x4_VSR(result2, n, m+4); + SAVE_1x4_VSR(result3, n, m+6); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double t0, t1, t2, t3; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra2, ra2); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra1, ra1, ra3, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double t0, t1, t2, t3; + + __vector_pair pb0; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); + LOAD_BP_1x4(k+1, n); + KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3); + } + for (; k < K; k++) { + 
LOAD_AT_4x1(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double t0, t1, t2, t3; + + __vector_pair pb0; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); + LOAD_BP_1x2(k+1, n); + KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_2ACC(pb0, pb1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0, ra1; + register vector double t0, t1; + + __vector_pair pb0; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_BP_1x4(k+1, n); + KERNEL_MMA_1ACC(pb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0, ra1; + register vector double t0, t1; + + __vector_pair pb0; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_BP_1x2(k+1, n); + KERNEL_MMA_1ACC(pb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + } + } + + for (; m < M; m++) { + for (n = 0; n < n8; n += 8) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 
= ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+2, m); + SAVE_4x1_VSR(result2, n+4, m); + SAVE_4x1_VSR(result3, n+6, m); + } + + for (; n < n4; n += 4) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+2, m); + } + + for (; n < n2; n += 2) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0; + + for (k = 0; k < K; k++) { + result += A[m*lda+k] * B[k*ldb+n]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if(has_packing) free(packA); + + return 0; +} diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c index e47de2cb5..65743731e 100644 --- a/kernel/power/dgemv_n_microk_power10.c +++ b/kernel/power/dgemv_n_microk_power10.c @@ -40,18 +40,27 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y XXSPLTD_S(32,%x9,0) // alpha, alpha "sldi %6, %13, 3 \n\t" // lda * sizeof (double) - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmuldp 34, 40, 32 \n\t" // x0 * alpha, x1 * alpha + "xvmuldp 35, 41, 32 \n\t" // x2 * alpha, x3 * alpha +#else "xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha "xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha +#endif "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda "add %6, %6, %6 \n\t" // 2 * lda - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha +#else XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha - +#endif "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda @@ -286,6 +295,16 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda "add %10, %10, %10 \n\t" // 2 * lda +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha + XXSPLTD_S(48,39,0) // x6 * alpha, x6 * alpha + XXSPLTD_S(49,39,1) // x7 * alpha, x7 * alpha + XXSPLTD_S(39,38,1) // x5 * alpha, x5 * alpha + XXSPLTD_S(38,38,0) // x4 * alpha, x4 * alpha +#else XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha @@ -294,6 +313,7 @@ static void 
dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha +#endif "add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda "add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda @@ -319,30 +339,69 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "one%=: \n\t" "lxvp 36, 0( %2) \n\t" // y0, y1 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" +#else "xvmaddadp 36, 40, 34 \n\t" "xvmaddadp 37, 41, 34 \n\t" +#endif "lxvpx 40, %3, %11 \n\t" // a0[0], a0[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 42, 33 \n\t" + "xvmaddadp 37, 43, 33 \n\t" +#else "xvmaddadp 36, 42, 35 \n\t" "xvmaddadp 37, 43, 35 \n\t" +#endif "lxvpx 42, %4, %11 \n\t" // a1[0], a1[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 44, 34 \n\t" + "xvmaddadp 37, 45, 34 \n\t" +#else "xvmaddadp 36, 44, 32 \n\t" "xvmaddadp 37, 45, 32 \n\t" +#endif "lxvpx 44, %5, %11 \n\t" // a2[0], a2[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 46, 35 \n\t" + "xvmaddadp 37, 47, 35 \n\t" +#else "xvmaddadp 36, 46, 33 \n\t" "xvmaddadp 37, 47, 33 \n\t" +#endif "lxvpx 46, %6, %11 \n\t" // a3[0], a3[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 50, 38 \n\t" + "xvmaddadp 37, 51, 38 \n\t" +#else "xvmaddadp 36, 50, 48 \n\t" "xvmaddadp 37, 51, 48 \n\t" +#endif "lxvpx 50, %7, %11 \n\t" // a4[0] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 52, 39 \n\t" + "xvmaddadp 37, 53, 39 \n\t" +#else "xvmaddadp 36, 52, 49 \n\t" "xvmaddadp 37, 53, 49 \n\t" +#endif "lxvpx 52, %8, %11 \n\t" // a5[0] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 54, 48 \n\t" + "xvmaddadp 37, 55, 48 \n\t" +#else "xvmaddadp 36, 54, 38 \n\t" "xvmaddadp 37, 55, 38 \n\t" +#endif "lxvpx 54, %9, %11 \n\t" // a6[0] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 56, 49 \n\t" + "xvmaddadp 37, 57, 49 \n\t" +#else "xvmaddadp 36, 56, 39 \n\t" "xvmaddadp 37, 57, 39 \n\t" +#endif "lxvpx 56, %10, %11 \n\t" // a7[0] "addi %11, %11, 32 \n\t" @@ -355,6 +414,24 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "two%=: \n\t" "lxvp 36, 0( %2) \n\t" // y0, y1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + "xvmaddadp 36, 42, 33 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + "xvmaddadp 36, 44, 34 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + "xvmaddadp 36, 46, 35 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + "xvmaddadp 36, 50, 38 \n\t" + "xvmaddadp 37, 51, 38 \n\t" + "xvmaddadp 36, 52, 39 \n\t" + "xvmaddadp 37, 53, 39 \n\t" + "xvmaddadp 36, 54, 48 \n\t" + "xvmaddadp 37, 55, 48 \n\t" + "xvmaddadp 36, 56, 49 \n\t" + "xvmaddadp 37, 57, 49 \n\t" +#else "xvmaddadp 36, 40, 34 \n\t" "xvmaddadp 37, 41, 34 \n\t" "xvmaddadp 36, 42, 35 \n\t" @@ -371,6 +448,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "xvmaddadp 37, 55, 38 \n\t" "xvmaddadp 36, 56, 39 \n\t" "xvmaddadp 37, 57, 39 \n\t" +#endif "stxvp 36, 0( %2) \n\t" // y0, y1 : diff --git a/kernel/power/dgemv_t_power10.c b/kernel/power/dgemv_t_power10.c index 3db4d5785..899b2a04b 100644 --- a/kernel/power/dgemv_t_power10.c +++ b/kernel/power/dgemv_t_power10.c @@ -279,34 +279,58 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "lxvp 40, 32(%[y]) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + 
XXMRGHD_S(42,34,35) + XXMRGLD_S(43,34,35) + XXMRGHD_S(44,4,5) + XXMRGLD_S(45,4,5) +#else XXMRGLD_S(42,35,34) XXMRGHD_S(43,35,34) XXMRGLD_S(44,5,4) XXMRGHD_S(45,5,4) +#endif "xvadddp 42,42,43 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXMRGHD_S(46,6,7) + XXMRGLD_S(47,6,7) +#else XXMRGLD_S(46,7,6) XXMRGHD_S(47,7,6) - +#endif "xvadddp 44,44,45 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXMRGHD_S(48,8,9) + XXMRGLD_S(49,8,9) +#else XXMRGLD_S(48,9,8) XXMRGHD_S(49,9,8) - +#endif "xvadddp 46,46,47 \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 38,42,36 \n\t" + "xvmaddadp 39,44,36 \n\t" +#else "xvmaddadp 39,42,36 \n\t" "xvmaddadp 38,44,36 \n\t" - +#endif "xvadddp 48,48,49 \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 41,48,36 \n\t" +#else "xvmaddadp 41,46,36 \n\t" - +#endif "stxvp 38, 0(%[y]) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 40,46,36 \n\t" +#else "xvmaddadp 40,48,36 \n\t" +#endif "stxvp 40, 32(%[y]) \n\t" : [memy] "+m" (*(double (*)[8])y), diff --git a/kernel/power/drot.c b/kernel/power/drot.c index 951c2f9c9..2aa0b8055 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "drot_microk_power8.c" +#elif defined(POWER10) +#include "drot_microk_power10.c" #endif #endif @@ -106,8 +108,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT { BLASLONG i=0; BLASLONG ix=0,iy=0; - FLOAT *x1=x; - FLOAT *y1=y; FLOAT temp; if ( n <= 0 ) return(0); @@ -115,12 +115,30 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 > 0 ) + { + drot_kernel_16(n1,&x[i], &y[i], c, s); + i+=n1; + } +#else BLASLONG n1 = n & -16; if ( n1 > 0 ) { - drot_kernel_16(n1, x1, y1, c, s); + drot_kernel_16(n1, x, y, c, s); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/drot_microk_power10.c b/kernel/power/drot_microk_power10.c new file mode 100644 index 000000000..e34e745c7 --- /dev/null +++ b/kernel/power/drot_microk_power10.c @@ -0,0 +1,148 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void drot_kernel_16 (long n, double *x, double *y, double c, double s) +{ + __asm__ + ( + XXSPLTD_S(36,%x5,0) // load c to both dwords + XXSPLTD_S(37,%x6,0) // load s to both dwords + "lxvp 32, 0(%3) \n\t" // load x + "lxvp 34, 32(%3) \n\t" + "lxvp 48, 0(%4) \n\t" // load y + "lxvp 50, 32(%4) \n\t" + + "addic. %2, %2, -8 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmuldp 40, 32, 36 \n\t" // c * x + "xvmuldp 41, 33, 36 \n\t" + "xvmuldp 42, 34, 36 \n\t" + "xvmuldp 43, 35, 36 \n\t" + + "xvmuldp 44, 32, 37 \n\t" // s * x + "xvmuldp 45, 33, 37 \n\t" + "xvmuldp 46, 34, 37 \n\t" + "xvmuldp 47, 35, 37 \n\t" + + "lxvp 32, 64(%3) \n\t" // load x + "lxvp 34, 96(%3) \n\t" + "xvmuldp 52, 48, 36 \n\t" // c * y + "xvmuldp 53, 49, 36 \n\t" + "xvmuldp 54, 50, 36 \n\t" + "xvmuldp 55, 51, 36 \n\t" + + "xvmuldp 38, 48, 37 \n\t" // s * y + "xvmuldp 39, 49, 37 \n\t" + "xvmuldp 56, 50, 37 \n\t" + "xvmuldp 57, 51, 37 \n\t" + + "lxvp 48, 64(%4) \n\t" // load y + "lxvp 50, 96(%4) \n\t" + + "xvadddp 40, 40, 38 \n\t" // c * x + s * y + "xvadddp 41, 41, 39 \n\t" // c * x + s * y + "xvadddp 42, 42, 56 \n\t" // c * x + s * y + "xvadddp 43, 43, 57 \n\t" // c * x + s * y + + "stxvp 40, 0(%3) \n\t" // store x + "stxvp 42, 32(%3) \n\t" + + "xvsubdp 52, 52, 44 \n\t" // c * y - s * x + "xvsubdp 53, 53, 45 \n\t" // c * y - s * x + "xvsubdp 54, 54, 46 \n\t" // c * y - s * x + "xvsubdp 55, 55, 47 \n\t" // c * y - s * x + + "stxvp 52, 0(%4) \n\t" // store y + "stxvp 54, 32(%4) \n\t" + + "addi %3, %3, 64 \n\t" + "addi %4, %4, 64 \n\t" + + "addic. 
%2, %2, -8 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmuldp 40, 32, 36 \n\t" // c * x + "xvmuldp 41, 33, 36 \n\t" + "xvmuldp 42, 34, 36 \n\t" + "xvmuldp 43, 35, 36 \n\t" + + "xvmuldp 52, 48, 36 \n\t" // c * y + "xvmuldp 53, 49, 36 \n\t" + "xvmuldp 54, 50, 36 \n\t" + "xvmuldp 55, 51, 36 \n\t" + + "xvmuldp 44, 32, 37 \n\t" // s * x + "xvmuldp 45, 33, 37 \n\t" + "xvmuldp 46, 34, 37 \n\t" + "xvmuldp 47, 35, 37 \n\t" + + "xvmuldp 38, 48, 37 \n\t" // s * y + "xvmuldp 39, 49, 37 \n\t" + "xvmuldp 56, 50, 37 \n\t" + "xvmuldp 57, 51, 37 \n\t" + + "xvadddp 40, 40, 38 \n\t" // c * x + s * y + "xvadddp 41, 41, 39 \n\t" // c * x + s * y + "xvadddp 42, 42, 56 \n\t" // c * x + s * y + "xvadddp 43, 43, 57 \n\t" // c * x + s * y + + "stxvp 40, 0(%3) \n\t" // store x + "stxvp 42, 32(%3) \n\t" + "xvsubdp 52, 52, 44 \n\t" // c * y - s * x + "xvsubdp 53, 53, 45 \n\t" // c * y - s * x + "xvsubdp 54, 54, 46 \n\t" // c * y - s * x + "xvsubdp 55, 55, 47 \n\t" // c * y - s * x + + "stxvp 52, 0(%4) \n\t" // store y + "stxvp 54, 32(%4) \n\t" + + "#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + "d" (c), // 5 + "d" (s) // 6 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57" + ); +} diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 39293252b..96c4e51bc 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "dscal_microk_power8.c" +#elif defined(POWER10) +#include "dscal_microk_power10.c" #endif #endif @@ -100,12 +102,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; + for (j = 0; j < align; j++) { + x[j] = 0.0; + } + } + BLASLONG n1 = (n-j) & -16; + if ( n1 > 0 ) + { + dscal_kernel_8_zero(n1, &x[j]); + j+=n1; + } +#else BLASLONG n1 = n & -16; if ( n1 > 0 ) { dscal_kernel_8_zero(n1, x); j=n1; } +#endif while(j < n) { @@ -118,12 +136,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; + for (j = 0; j < align; j++) { + x[j] = da * x[j]; + } + } + BLASLONG n1 = (n-j) & -16; + if ( n1 > 0 ) + { + dscal_kernel_8(n1, &x[j], da); + j+=n1; + } +#else BLASLONG n1 = n & -16; if ( n1 > 0 ) { dscal_kernel_8(n1, x, da); j=n1; } +#endif while(j < n) { diff --git a/kernel/power/dscal_microk_power10.c b/kernel/power/dscal_microk_power10.c new file mode 100644 index 000000000..d0d506f24 --- /dev/null +++ b/kernel/power/dscal_microk_power10.c @@ -0,0 +1,134 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void dscal_kernel_8 (long n, double *x, double alpha) +{ + __asm__ + ( + "dcbt 0, %2 \n\t" + + XXSPLTD_S(48,%x3,0) + + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + + "addic. %1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmuldp 40, 32, 48 \n\t" + "xvmuldp 41, 33, 48 \n\t" + "xvmuldp 42, 34, 48 \n\t" + "xvmuldp 43, 35, 48 \n\t" + "lxvp 32, 128(%2) \n\t" + "lxvp 34, 160(%2) \n\t" + "xvmuldp 44, 36, 48 \n\t" + "xvmuldp 45, 37, 48 \n\t" + "xvmuldp 46, 38, 48 \n\t" + "xvmuldp 47, 39, 48 \n\t" + "lxvp 36, 192(%2) \n\t" + "lxvp 38, 224(%2) \n\t" + + "stxvp 40, 0(%2) \n\t" + "stxvp 42, 32(%2) \n\t" + "stxvp 44, 64(%2) \n\t" + "stxvp 46, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmuldp 40, 32, 48 \n\t" + "xvmuldp 41, 33, 48 \n\t" + "xvmuldp 42, 34, 48 \n\t" + "xvmuldp 43, 35, 48 \n\t" + + "xvmuldp 44, 36, 48 \n\t" + "xvmuldp 45, 37, 48 \n\t" + "xvmuldp 46, 38, 48 \n\t" + "xvmuldp 47, 39, 48 \n\t" + + "stxvp 40, 0(%2) \n\t" + "stxvp 42, 32(%2) \n\t" + "stxvp 44, 64(%2) \n\t" + "stxvp 46, 96(%2) \n\t" + + "#n=%1 alpha=%3 x=%0=%2" + : + "+m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + "d" (alpha) // 3 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47","vs48" + ); +} + + +static void dscal_kernel_8_zero (long n, double *x) +{ + + __asm__ + ( + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "stxvp 32, 0(%2) \n\t" + "stxvp 32, 32(%2) \n\t" + "stxvp 32, 64(%2) \n\t" + "stxvp 32, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "#n=%1 x=%0=%2 " + : + "=m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + : + "cr0","vs32","vs33" + ); +} diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index ff3f95c79..9e6229c6a 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "dswap_microk_power8.c" +#elif defined(POWER10) +#include "swap_microk_power10.c" #endif #endif @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } + BLASLONG n1 = (n-i) & -32; + if ( n1 > 0 ) + { + dswap_kernel_32(n1,&x[i], &y[i]); + i+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { dswap_kernel_32(n1, x, y); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/gemm_small_kernel_permit_power10.c b/kernel/power/gemm_small_kernel_permit_power10.c new file mode 100644 index 000000000..9b38e457b --- /dev/null +++ b/kernel/power/gemm_small_kernel_permit_power10.c @@ -0,0 +1,84 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + +#if defined(DOUBLE) // dgemm + + // gcc11 (minor <= 2) has an issue when multiple assemble_pairs are used. This + // issue affects both dgemm_nn and dgemm_tn. +#if (defined(__GNUC__) && (__GNUC__ == 11 && __GNUC_MINOR__ <= 2)) + if (!transb) + return 0; +#endif + + if (MNK <= 54.0*54.0*54.0) + return 1; + +#else // sgemm + +#if defined(__GNUC__) && defined(__clang__) + // clang generates code with register spilling for the region of code with + // packing, thus, we had to disable this optimization for clang. 
Given that + // the packing on-demand used in this work is one of the reasons that lead the + // small kernels to outperform the normal flow (when MNK increases), with it + // disabled we had to reduce the MNK inputs used by the code generated by clang. + if (MNK > 84.0*84.0*84.0) + return 0; + + if (transa && !transb) { + // sgemm_tn works better when packing on-demand is used + if (MNK <= 64.0*64.0*64.0 && K >= 4) + return 1; + else + return 0; + } + +#else // gcc + + if (MNK > 100.0*100.0*100.0) + return 0; + +#endif + + // Multi-threading execution outperforms (or approaches) the execution of the + // small kernel. + if (num_cpu_avail(3) > 1) { + if (MNK <= 64.0*64.0*64.0) + return 1; + } else { + return 1; + } + +#endif + + return 0; +} diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index abc61b62e..9c6f87639 100644 --- a/kernel/power/gemv_n.S +++ b/kernel/power/gemv_n.S @@ -159,6 +159,11 @@ #define PREFETCHSIZE_C 16 #endif +#ifdef POWER3 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + #ifdef POWER4 #define PREFETCHSIZE_A 16 #define PREFETCHSIZE_C 16 diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index 25a4dd01b..accdad702 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -124,6 +124,11 @@ #define PREFETCHSIZE_C 16 #endif +#ifdef POWER3 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + #ifdef POWER4 #define PREFETCHSIZE_A 48 #define PREFETCHSIZE_C 16 diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c index 5016f67dd..f1ef00066 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -330,10 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (inc_x == 1) { - BLASLONG n1 = n & -32; #if defined(_CALL_ELF) && (_CALL_ELF == 2) #if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -32; if (n1 > 0) { max = diamax_kernel_32(n1, x, &maxf); diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index 733137012..af692a7fa 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "sasum_microk_power8.c" +#elif defined(POWER10) +#include "sasum_microk_power10.c" #endif #endif @@ -110,6 +112,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + sumf += ABS(x[i]); + } + } + n1 = (n-i) & -32; + if ( n1 > 0 ) + { + sumf += sasum_kernel_32(n1, &x[i]); + i+=n1; + } +#else n1 = n & -32; if ( n1 > 0 ) { @@ -117,6 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sumf = sasum_kernel_32(n1, x); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/sasum_microk_power10.c b/kernel/power/sasum_microk_power10.c new file mode 100644 index 000000000..ea12a4264 --- /dev/null +++ b/kernel/power/sasum_microk_power10.c @@ -0,0 +1,153 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define HAVE_KERNEL_32 1 + +static float sasum_kernel_32 (long n, float *x) +{ + float sum; + __vector float t0; + __vector float t1; + __vector float t2; + __vector float t3; + + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + "lxvp 40, 0(%2) \n\t" + + "xvabssp %x3, 44 \n\t" + "xvabssp %x4, 45 \n\t" + "lxvp 42, 32(%2) \n\t" + + "xvabssp %x5, 46 \n\t" + "xvabssp %x6, 47 \n\t" + "lxvp 44, 64(%2) \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + + "lxvp 46, 96(%2) \n\t" + + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvaddsp 36, 36, %x3 \n\t" + "xvaddsp 37, 37, %x4 \n\t" + "addic. 
%1, %1, -32 \n\t" + "xvaddsp 38, 38, %x5 \n\t" + "xvaddsp 39, 39, %x6 \n\t" + + "bgt one%= \n" + + "two%=: \n\t" + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + "xvabssp %x3, 44 \n\t" + "xvabssp %x4, 45 \n\t" + "xvabssp %x5, 46 \n\t" + "xvabssp %x6, 47 \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "xvaddsp 36, 36, %x3 \n\t" + "xvaddsp 37, 37, %x4 \n\t" + "xvaddsp 38, 38, %x5 \n\t" + "xvaddsp 39, 39, %x6 \n\t" + + "xvaddsp 32, 32, 33 \n\t" + "xvaddsp 34, 34, 35 \n\t" + "xvaddsp 36, 36, 37 \n\t" + "xvaddsp 38, 38, 39 \n\t" + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 32, 32, 36 \n\t" + + "xxsldwi 33, 32, 32, 2 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xxsldwi 33, 32, 32, 1 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xscvspdp %x0, 32 \n" + + "#n=%1 x=%3=%2 sum=%0\n" + "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" + : + "=f" (sum), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "=wa" (t0), // 3 + "=wa" (t1), // 4 + "=wa" (t2), // 5 + "=wa" (t3) // 6 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); + + return sum; +} diff --git a/kernel/power/saxpy_power10.c b/kernel/power/saxpy_power10.c index 8c7c22390..4a13c1f88 100644 --- a/kernel/power/saxpy_power10.c +++ b/kernel/power/saxpy_power10.c @@ -64,12 +64,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -64; - + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -64; if ( n1 ) - saxpy_kernel_64(n1, x, y, da); + saxpy_kernel_64(n1, &x[i], &y[i], da); - i = n1; + i += n1; while(i < n) { diff --git a/kernel/power/sbgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c index d15586703..134929ec1 100644 --- a/kernel/power/sbgemm_kernel_power10.c +++ b/kernel/power/sbgemm_kernel_power10.c @@ -49,17 +49,11 @@ typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); -vector char mask = - { 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xa, 0xb, 0x4, 0x5, 0xc, 0xd, 0x6, 0x7, 0xe, - 0xf -}; - /* * BFLOAT16 xvbf16ger2pp instruction needs 4×2 matrix of * bfloat16 floating-point values as input. Hence this * merging is needed on A and B matrices. 
*/ -#define MERGE_ROW(x) vec_perm(x, x, mask) #define MERGE_HIGH(x, y) (vec_t) vec_mergeh ((vector short)x, (vector short)y) #define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) @@ -104,6 +98,30 @@ vector char mask = rowC = (v2sf_t *) &CO[7* ldc+J]; \ rowC[0] += result[6] * alpha; + #define SAVE4x2_ACC_SCALAR(ACC) { \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + res[0] = result[0] * alpha; \ + res[1] = result[1] * alpha; \ + res[2] = result[2] * alpha; \ + res[3] = result[3] * alpha; \ + CO[0 * ldc] += res[0][0]; \ + CO[1 * ldc] += res[1][0]; \ + CO[2 * ldc] += res[2][0]; \ + CO[3 * ldc] += res[3][0]; \ + } + + #define SAVE4x2_ACC1_SCALAR(ACC) { \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + res[0] = result[0] * alpha; \ + res[1] = result[1] * alpha; \ + res[2] = result[2] * alpha; \ + res[3] = result[3] * alpha; \ + CO[4 * ldc] += res[0][0]; \ + CO[5 * ldc] += res[1][0]; \ + CO[6 * ldc] += res[2][0]; \ + CO[7 * ldc] += res[3][0]; \ +} + #define MMA __builtin_mma_xvbf16ger2pp #define SAVE2x4_ACC(ACC, J) \ @@ -179,8 +197,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 4; vec_t *rowA = (vec_t *) & (AO[l << 1]); vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); - vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowB_h = MERGE_HIGH (rowB[0], vzero); + vec_t rowB_l = MERGE_LOW (rowB[0], vzero); vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); vec_t rowA_l = MERGE_LOW (rowA[0], vzero); vec_t rowA2_h = MERGE_HIGH (rowA[1], vzero); @@ -231,8 +249,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 4; vec_t *rowA = (vec_t *) & (AO[l]); vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); - vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowB_h = MERGE_HIGH (rowB[0], vzero); + vec_t rowB_l = MERGE_LOW (rowB[0], vzero); vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); vec_t rowA_l = MERGE_LOW (rowA[0], vzero); MMA (&acc0, rowB_h, rowA_h); @@ -271,8 +289,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; vec_t *rowB = (vec_t *) & (BO[l << 1]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); } SAVE_ACC (&acc0, 0); SAVE_ACC1 (&acc1, 0); @@ -306,8 +324,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 2; vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowB = (vec_t *) & (BO[(l << 2)]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); } SAVE4x2_ACC (&acc0, 0); SAVE4x2_ACC1 (&acc1, 0); @@ -319,7 +337,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { IFLOAT *BO = B; v2sf_t *rowC; - v2sf_t result[8]; + v4sf_t result[4], res[4]; __vector_quad acc0, acc1; __builtin_mma_xxsetaccz (&acc0); __builtin_mma_xxsetaccz (&acc1); @@ -338,11 +356,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 1; vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 }; vec_t *rowB = (vec_t *) & (BO[(l << 3)]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) 
rowA); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); } - SAVE4x2_ACC (&acc0, 0); - SAVE4x2_ACC1 (&acc1, 0); + SAVE4x2_ACC_SCALAR (&acc0); + SAVE4x2_ACC1_SCALAR (&acc1); CO += 1; AO += k; BO += (k << 3); @@ -387,16 +405,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 3; vec_t *rowA = (vec_t *) & (AO[(l << 2)]); vec_t *rowA1 = (vec_t *) & (A1[(l << 2)]); - vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); - MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); - MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); - MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], vzero)); - MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], vzero)); - MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], vzero)); - MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], vzero)); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t)rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t)rowB_mrg, MERGE_LOW (rowA[1], vzero)); + MMA (&acc4, (vec_t)rowB_mrg, MERGE_HIGH (rowA1[0], vzero)); + MMA (&acc5, (vec_t)rowB_mrg, MERGE_LOW (rowA1[0], vzero)); + MMA (&acc6, (vec_t)rowB_mrg, MERGE_HIGH (rowA1[1], vzero)); + MMA (&acc7, (vec_t)rowB_mrg, MERGE_LOW (rowA1[1], vzero)); } SAVE_ACC (&acc0, 0); @@ -436,12 +454,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 3; vec_t *rowA = (vec_t *) & (AO[(l << 2)]); - vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); - MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); - MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t)rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t)rowB_mrg, MERGE_LOW (rowA[1], vzero)); } SAVE_ACC (&acc0, 0); @@ -475,9 +493,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 3; vec_t *rowA = (vec_t *) & (AO[l << 1]); vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); } SAVE_ACC (&acc0, 0); SAVE_ACC (&acc1, 4); @@ -505,8 +524,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 3; vector short rowA = { AO[l], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; - vec_t *rowB = (vec_t *) & (BO[l]); - MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); } SAVE_ACC (&acc0, 0); CO += 4; @@ -536,8 +556,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 
2; vector short rowA = { AO[l], 0, AO[l + 1], 0, 0, 0, 0, 0 }; - vec_t *rowB = (vec_t *) & (BO[l << 1]); - MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + vector short rowB_mrg = + { BO[(l<<1)], 0, BO[(l<<1) + 1], 0, BO[(l<<1) + 2], 0, + BO[(l<<1) + 3], 0 + }; + MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); } SAVE4x2_ACC (&acc0, 0); CO += 2; @@ -548,7 +571,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { IFLOAT *BO = B; v2sf_t *rowC; - v2sf_t result[8]; + v4sf_t result[4], res[4]; __vector_quad acc0; BLASLONG l = 0; __builtin_mma_xxsetaccz (&acc0); @@ -566,10 +589,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 1; vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 }; - vec_t *rowB = (vec_t *) & (BO[l << 2]); - MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + vector short rowB_mrg = + { BO[(l<<2) + 0], 0, BO[(l<<2) + 1], 0, BO[(l <<2) + 2], 0, + BO[(l<<2) + 3], 0 + }; + MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); } - SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC_SCALAR (&acc0); AO += k; BO += (k << 2); CO += 1; @@ -620,14 +646,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[l << 3]); vec_t *rowA1 = (vec_t *) & (A1[l << 3]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); - MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); - MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); - MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); - MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero)); + MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], vzero)); + MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], vzero)); + MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], vzero)); + MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], vzero)); } SAVE2x4_ACC (&acc0, 0); SAVE2x4_ACC (&acc1, 4); @@ -669,10 +695,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 2; vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[l << 3]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero )); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero)); } SAVE2x4_ACC (&acc0, 0); SAVE2x4_ACC (&acc1, 4); @@ -708,8 +734,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 2; vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[(l << 2)]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); } SAVE2x4_ACC (&acc0, 0); SAVE2x4_ACC (&acc1, 
4); @@ -740,8 +766,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 2; vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; - vec_t *rowA = (vec_t *) & (AO[l << 1]); - MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + vector short rowA = + { AO[(l << 1)], 0, AO[(l << 1) + 1] , 0 , AO[(l<<1) + 2], + 0, AO[(l << 1) + 3], 0 }; + MMA (&acc0, (vec_t) rowB, (vec_t)(rowA)); } SAVE2x4_ACC (&acc0, 0); CO += 4; @@ -829,10 +857,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 1; vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[(l << 4)]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero)); } rowC = (v4sf_t *) &CO[0]; __builtin_mma_disassemble_acc ((void *)result, &acc0); @@ -871,8 +899,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 1; vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[(l << 3)]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); } rowC = (v4sf_t *) &CO[0]; __builtin_mma_disassemble_acc ((void *)result, &acc0); @@ -904,8 +932,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 1; vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; - vec_t *rowA = (vec_t *) & (AO[(l << 2)]); - MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + vector short rowA = + { AO[(l << 2)], 0, AO[(l << 2) + 1] , 0 , + AO[(l << 2) + 2], 0, AO[(l << 2) + 3], 0 }; + MMA (&acc0, (vec_t) rowB, (vec_t)(rowA)); } rowC = (v4sf_t *) &CO[0]; __builtin_mma_disassemble_acc ((void *)result, &acc0); diff --git a/kernel/power/scopy_power10.c b/kernel/power/scopy_power10.c index 298a8998a..3398ce827 100644 --- a/kernel/power/scopy_power10.c +++ b/kernel/power/scopy_power10.c @@ -86,11 +86,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1 )) { - BLASLONG n1 = n & -128; - if ( n1 > 0 ) + if ( n >= 128 ) { - copy_kernel (n1, x, y); - i=n1; + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + y[i] = x[i] ; + } + } + BLASLONG n1 = (n-i) & -128; + if ( n1 ) + { + copy_kernel(n1, &x[i], &y[i]); + i += n1; } while(i < n) diff --git a/kernel/power/sgemm_small_kernel_nn_power10.c b/kernel/power/sgemm_small_kernel_nn_power10.c new file mode 100644 index 000000000..59222a436 --- /dev/null +++ b/kernel/power/sgemm_small_kernel_nn_power10.c @@ -0,0 +1,1563 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <altivec.h> + +typedef __vector unsigned char vec_t; + +#if !defined(B0) +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \ + 
rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \ + C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \ + C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; \ + C[(N+2)*ldc+M] = result[2]; \ + C[(N+3)*ldc+M] = result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() \ + __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_A_1x16(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); 
\ + ra1 = vec_xl(0, A+((K)*lda)+M+4); \ + ra2 = vec_xl(0, A+((K)*lda)+M+8); \ + ra3 = vec_xl(0, A+((K)*lda)+M+12); + +#define LOAD_A_1x8(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); \ + ra1 = vec_xl(0, A+((K)*lda)+M+4); + +#define LOAD_A_1x4(K, M) ra0 = vec_xl(0, A+((K)*lda)+M); + +#define LOAD_A_2x2(K, M) \ + ra0 = vec_splats(A[K*lda+M]); \ + ra0 = vec_insert(A[K*lda+M+1], ra0, 1); \ + ra0 = vec_insert(A[K*lda+M+1], ra0, 3); + +#define LOAD_A_1x2(K, M) ra0 = vec_xl_len(A+((K)*lda)+M, 8); + +#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[(K)*lda+M]); + +#define LOAD_BT_16x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergel(rb4, rb5); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K); \ + t2 = vec_mergeh(rb6, rb7); \ + t3 = vec_mergel(rb6, rb7); \ + rb4 = vec_xxpermdi(t0, t2, 0b00); \ + rb5 = vec_xxpermdi(t0, t2, 0b11); \ + rb6 = vec_xxpermdi(t1, t3, 0b00); \ + rb7 = vec_xxpermdi(t1, t3, 0b11); \ + rb8 = vec_xl(0, B+(N+8)*ldb+K); \ + rb9 = vec_xl(0, B+(N+9)*ldb+K); \ + t0 = vec_mergeh(rb8, rb9); \ + t1 = vec_mergel(rb8, rb9); \ + rb10 = vec_xl(0, B+(N+10)*ldb+K); \ + rb11 = vec_xl(0, B+(N+11)*ldb+K); \ + t2 = vec_mergeh(rb10, rb11); \ + t3 = vec_mergel(rb10, rb11); \ + rb8 = vec_xxpermdi(t0, t2, 0b00); \ + rb9 = vec_xxpermdi(t0, t2, 0b11); \ + rb10 = vec_xxpermdi(t1, t3, 0b00); \ + rb11 = vec_xxpermdi(t1, t3, 0b11); \ + rb12 = vec_xl(0, B+(N+12)*ldb+K); \ + rb13 = vec_xl(0, B+(N+13)*ldb+K); \ + t0 = vec_mergeh(rb12, rb13); \ + t1 = vec_mergel(rb12, rb13); \ + rb14 = vec_xl(0, B+(N+14)*ldb+K); \ + rb15 = vec_xl(0, B+(N+15)*ldb+K); \ + t2 = vec_mergeh(rb14, rb15); \ + t3 = vec_mergel(rb14, rb15); \ + rb12 = vec_xxpermdi(t0, t2, 0b00); \ + rb13 = vec_xxpermdi(t0, t2, 0b11); \ + rb14 = vec_xxpermdi(t1, t3, 0b00); \ + rb15 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_16x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); \ + rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \ + rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \ + t0 = vec_mergeh(rb4, rb5); \ + rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \ + rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \ + t1 = vec_mergeh(rb6, rb7); \ + rb2 = vec_xxpermdi(t0, t1, 0b00); \ + rb3 = vec_xxpermdi(t0, t1, 0b11); \ + rb8 = vec_xl_len(B+(N+8)*ldb+K, 8); \ + rb9 = vec_xl_len(B+(N+9)*ldb+K, 8); \ + t0 = vec_mergeh(rb8, rb9); \ + rb10 = vec_xl_len(B+(N+10)*ldb+K, 8); \ + rb11 = vec_xl_len(B+(N+11)*ldb+K, 8); \ + t1 = vec_mergeh(rb10, rb11); \ + rb4 = vec_xxpermdi(t0, t1, 0b00); \ + rb5 = vec_xxpermdi(t0, t1, 0b11); \ + rb12 = vec_xl_len(B+(N+12)*ldb+K, 8); \ + rb13 = vec_xl_len(B+(N+13)*ldb+K, 8); \ + t0 = vec_mergeh(rb12, rb13); \ + rb14 = vec_xl_len(B+(N+14)*ldb+K, 8); \ + rb15 = vec_xl_len(B+(N+15)*ldb+K, 8); \ + t1 = vec_mergeh(rb14, rb15); \ + rb6 = vec_xxpermdi(t0, t1, 0b00); \ + rb7 = vec_xxpermdi(t0, t1, 0b11); + +#define 
LOAD_BT_16x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3); \ + rb2 = vec_xor(rb2, rb2); \ + rb2 = vec_insert(B[(N+8)*ldb+K], rb2, 0); \ + rb2 = vec_insert(B[(N+9)*ldb+K], rb2, 1); \ + rb2 = vec_insert(B[(N+10)*ldb+K], rb2, 2); \ + rb2 = vec_insert(B[(N+11)*ldb+K], rb2, 3); \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(B[(N+12)*ldb+K], rb3, 0); \ + rb3 = vec_insert(B[(N+13)*ldb+K], rb3, 1); \ + rb3 = vec_insert(B[(N+14)*ldb+K], rb3, 2); \ + rb3 = vec_insert(B[(N+15)*ldb+K], rb3, 3); + +#define LOAD_BT_8x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergel(rb4, rb5); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K); \ + t2 = vec_mergeh(rb6, rb7); \ + t3 = vec_mergel(rb6, rb7); \ + rb4 = vec_xxpermdi(t0, t2, 0b00); \ + rb5 = vec_xxpermdi(t0, t2, 0b11); \ + rb6 = vec_xxpermdi(t1, t3, 0b00); \ + rb7 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_8x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); \ + rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \ + rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \ + t0 = vec_mergeh(rb4, rb5); \ + rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \ + rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \ + t1 = vec_mergeh(rb6, rb7); \ + rb2 = vec_xxpermdi(t0, t1, 0b00); \ + rb3 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_8x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3); + +#define LOAD_BT_4x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_4x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, 
t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_4x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); + +#define LOAD_BT_2x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeo(rb0, rb1); \ + t2 = vec_mergel(rb0, rb1); \ + rb0 = t0; \ + rb1 = t1; \ + rb2 = t2; \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(vec_extract(t2,2), rb3, 0); \ + rb3 = vec_insert(vec_extract(t2,3), rb3, 1); + +#define LOAD_BT_2x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergee(rb0, rb1); \ + t1 = vec_mergeo(rb0, rb1); \ + rb0 = t0; \ + rb1 = t1; + +#define LOAD_BT_2x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); + +#define LOAD_B_2x2(N, K) \ + rb0 = vec_splats(B[(N+0)*ldb+K]); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 3); + +#define LOAD_B_2x1(N, K) \ + rb0 = vec_insert(B[(n+0)*ldb+k], rb0, 0); \ + rb0 = vec_insert(B[(n+1)*ldb+k], rb0, 1); + +#define LOAD_B_1x1(N, K) rb0 = vec_splats(B[(N)*ldb+K]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ + __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ + __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ + __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ + __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_B(rb0, rb1, rb2, rb3, offset) \ + vec_xst(rb0, 0, packB+(k*16)+0+offset); \ + vec_xst(rb1, 0, packB+(k*16)+4+offset); \ + vec_xst(rb2, 0, packB+(k*16)+8+offset); \ + vec_xst(rb3, 0, packB+(k*16)+12+offset); + +#define LOAD_PACKED_B(rb0, rb1, rb2, rb3, offset) \ + rb0 = vec_xl(0, packB+(k*16)+0+offset); \ + rb1 = vec_xl(0, packB+(k*16)+4+offset); \ + rb2 = vec_xl(0, packB+(k*16)+8+offset); \ + rb3 = vec_xl(0, packB+(k*16)+12+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, 
BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k4 = K & ~3; + BLASLONG k2 = K & ~1; + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 1 : 0; +#else + int has_packing = 0; +#endif + + float *packB; + if (has_packing) packB = (float *)malloc(K*16*sizeof(float)); + + vector float valpha = vec_splats(alpha); +#if !defined(B0) + vector float vbeta = vec_splats(beta); +#endif + + for (n = 0; n < n16; n += 16) { + for (m = 0; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + if (has_packing) { + if (m == 0) { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_16x4(n, k); + KERNEL_MMA_8ACC(rb0, rb4, rb8, rb12, rb0, rb4, rb8, rb12, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb0, rb4, rb8, rb12, 0); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(rb1, rb5, rb9, rb13, rb1, rb5, rb9, rb13, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb1, rb5, rb9, rb13, 16); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_8ACC(rb2, rb6, rb10, rb14, rb2, rb6, rb10, rb14, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb2, rb6, rb10, rb14, 32); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_8ACC(rb3, rb7, rb11, rb15, rb3, rb7, rb11, rb15, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb3, rb7, rb11, rb15, 48); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_16x2(n, k); + KERNEL_MMA_8ACC(rb0, rb2, rb4, rb6, rb0, rb2, rb4, rb6, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb0, rb2, rb4, rb6, 0); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(rb1, rb3, rb5, rb7, rb1, rb3, rb5, rb7, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb1, rb3, rb5, rb7, 16); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_16x1(n, k); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb0, rb1, rb2, rb3, 0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(rb0, rb4, rb8, rb12, 0); + KERNEL_MMA_8ACC(rb0, rb4, rb8, rb12, rb0, rb4, rb8, rb12, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+1, m); + LOAD_PACKED_B(rb1, rb5, rb9, rb13, 16); + KERNEL_MMA_8ACC(rb1, rb5, rb9, rb13, rb1, rb5, rb9, rb13, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+2, m); + LOAD_PACKED_B(rb2, rb6, rb10, rb14, 32); + KERNEL_MMA_8ACC(rb2, rb6, rb10, rb14, rb2, rb6, rb10, rb14, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+3, m); + LOAD_PACKED_B(rb3, rb7, rb11, rb15, 48); + KERNEL_MMA_8ACC(rb3, rb7, rb11, rb15, rb3, rb7, rb11, rb15, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(rb0, rb2, rb4, rb6, 0); + KERNEL_MMA_8ACC(rb0, rb2, rb4, rb6, rb0, rb2, rb4, rb6, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+1, m); + LOAD_PACKED_B(rb1, rb3, rb5, rb7, 16); + KERNEL_MMA_8ACC(rb1, rb3, rb5, rb7, rb1, rb3, rb5, rb7, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(rb0, rb1, rb2, rb3, 0); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, 
rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_16x4(n, k); + KERNEL_MMA_8ACC(rb0, rb4, rb8, rb12, rb0, rb4, rb8, rb12, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(rb1, rb5, rb9, rb13, rb1, rb5, rb9, rb13, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_8ACC(rb2, rb6, rb10, rb14, rb2, rb6, rb10, rb14, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_8ACC(rb3, rb7, rb11, rb15, rb3, rb7, rb11, rb15, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_16x2(n, k); + KERNEL_MMA_8ACC(rb0, rb2, rb4, rb6, rb0, rb2, rb4, rb6, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(rb1, rb3, rb5, rb7, rb1, rb3, rb5, rb7, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_16x1(n, k); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + SAVE_4x4_ACC(&acc4, n+0, m+4); + SAVE_4x4_ACC(&acc5, n+4, m+4); + SAVE_4x4_ACC(&acc6, n+8, m+4); + SAVE_4x4_ACC(&acc7, n+12, m+4); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_BT_16x4(n, k); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+2, m); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+3, m); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra0, ra0, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BT_16x2(n, k); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra0, ra0, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BT_16x1(n, k); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(rb0, rb4, rb8, rb12, 0); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+1, m); + LOAD_PACKED_B(rb1, rb5, rb9, rb13, 16); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+2, m); + LOAD_PACKED_B(rb2, rb6, rb10, rb14, 32); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+3, m); + LOAD_PACKED_B(rb3, rb7, rb11, rb15, 48); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra0, ra0, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(rb0, rb2, rb4, rb6, 0); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+1, m); + LOAD_PACKED_B(rb1, rb3, rb5, rb7, 16); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra0, ra0, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(rb0, rb1, rb2, rb3, 0); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + 
SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + } + + for (; m < m2; m += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x2(k, m); + LOAD_BT_16x4(n, k); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+2, m); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+3, m); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra0, ra0, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BT_16x2(n, k); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra0, ra0, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BT_16x1(n, k); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(rb0, rb4, rb8, rb12, 0); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+1, m); + LOAD_PACKED_B(rb1, rb5, rb9, rb13, 16); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+2, m); + LOAD_PACKED_B(rb2, rb6, rb10, rb14, 32); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+3, m); + LOAD_PACKED_B(rb3, rb7, rb11, rb15, 48); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra0, ra0, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(rb0, rb2, rb4, rb6, 0); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+1, m); + LOAD_PACKED_B(rb1, rb3, rb5, rb7, 16); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra0, ra0, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(rb0, rb1, rb2, rb3, 0); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc2, n+8, m+0); + SAVE_4x2_ACC(&acc3, n+12, m+0); + } + + for (; m < M; m++) { + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(k, m); + LOAD_BT_16x4(n, k); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb4, rb8, rb12); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb1, rb5, rb9, rb13); + LOAD_A_1x1(k+2, m); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb2, rb6, rb10, rb14); + LOAD_A_1x1(k+3, m); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb3, rb7, rb11, rb15); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BT_16x2(n, k); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb2, rb4, rb6); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb1, rb3, rb5, rb7); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BT_16x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + } else { + for 
(k = 0; k < k4; k += 4) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(rb0, rb4, rb8, rb12, 0); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb4, rb8, rb12); + LOAD_A_1x1(k+1, m); + LOAD_PACKED_B(rb1, rb5, rb9, rb13, 16); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb1, rb5, rb9, rb13); + LOAD_A_1x1(k+2, m); + LOAD_PACKED_B(rb2, rb6, rb10, rb14, 32); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb2, rb6, rb10, rb14); + LOAD_A_1x1(k+3, m); + LOAD_PACKED_B(rb3, rb7, rb11, rb15, 48); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb3, rb7, rb11, rb15); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(rb0, rb2, rb4, rb6, 0); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb2, rb4, rb6); + LOAD_A_1x1(k+1, m); + LOAD_PACKED_B(rb1, rb3, rb5, rb7, 16); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb1, rb3, rb5, rb7); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(rb0, rb1, rb2, rb3, 0); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + SAVE_4x1_VSR(result2, n+8, m); + SAVE_4x1_VSR(result3, n+12, m); + } + } + + for (; n < n8; n += 8) { + for (m = 0; m < m16; m += 16) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x16(k, m); + LOAD_BT_8x4(n, k); + KERNEL_MMA_8ACC(rb0, rb4, rb0, rb4, rb0, rb4, rb0, rb4, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_8ACC(rb1, rb5, rb1, rb5, rb1, rb5, rb1, rb5, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x16(k+2, m); + KERNEL_MMA_8ACC(rb2, rb6, rb2, rb6, rb2, rb6, rb2, rb6, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x16(k+3, m); + KERNEL_MMA_8ACC(rb3, rb7, rb3, rb7, rb3, rb7, rb3, rb7, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_A_1x16(k, m); + LOAD_BT_8x2(n, k); + KERNEL_MMA_8ACC(rb0, rb2, rb0, rb2, rb0, rb2, rb0, rb2, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_8ACC(rb1, rb3, rb1, rb3, rb1, rb3, rb1, rb3, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + for (; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_BT_8x1(n, k); + KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc4, n+0, m+8); + SAVE_4x4_ACC(&acc6, n+0, m+12); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + SAVE_4x4_ACC(&acc5, n+4, m+8); + SAVE_4x4_ACC(&acc7, n+4, m+12); + } + + for (; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_8x4(n, k); + KERNEL_MMA_4ACC(rb0, rb4, rb0, rb4, ra0, ra0, ra1, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_4ACC(rb1, rb5, rb1, rb5, ra0, ra0, ra1, ra1); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_4ACC(rb2, rb6, rb2, rb6, ra0, ra0, ra1, ra1); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_4ACC(rb3, rb7, rb3, rb7, ra0, ra0, ra1, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_8x2(n, k); + KERNEL_MMA_4ACC(rb0, rb2, rb0, rb2, ra0, ra0, ra1, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_4ACC(rb1, 
rb3, rb1, rb3, ra0, ra0, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_8x1(n, k); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_BT_8x4(n, k); + KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_2ACC(rb1, rb5, ra0, ra0); + LOAD_A_1x4(k+2, m); + KERNEL_MMA_2ACC(rb2, rb6, ra0, ra0); + LOAD_A_1x4(k+3, m); + KERNEL_MMA_2ACC(rb3, rb7, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BT_8x2(n, k); + KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_2ACC(rb1, rb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BT_8x1(n, k); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + } + + for (; m < m2; m += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x2(k, m); + LOAD_BT_8x4(n, k); + KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_2ACC(rb1, rb5, ra0, ra0); + LOAD_A_1x2(k+2, m); + KERNEL_MMA_2ACC(rb2, rb6, ra0, ra0); + LOAD_A_1x2(k+3, m); + KERNEL_MMA_2ACC(rb3, rb7, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BT_8x2(n, k); + KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_2ACC(rb1, rb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BT_8x1(n, k); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; m < M; m++) { + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(k, m); + LOAD_BT_8x4(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb4); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb5); + LOAD_A_1x1(k+2, m); + KERNEL_VMADD_2VSR(ra0, ra0, rb2, rb6); + LOAD_A_1x1(k+3, m); + KERNEL_VMADD_2VSR(ra0, ra0, rb3, rb7); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BT_8x2(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb2); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb3); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BT_8x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + } + } + + for (; n < n4; n += 4) { + for (m = 0; m < m16; m += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x16(k, m); + LOAD_BT_4x4(n, k); + 
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+2, m); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+3, m); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra0, ra1, ra2, ra3); + } + for (; k < k2; k += 2) { + LOAD_A_1x16(k, m); + LOAD_BT_4x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra0, ra1, ra2, ra3); + } + for (; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_BT_4x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + } + + for (; m < m8; m += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_4x4(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_2ACC(rb1, rb1, ra0, ra1); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_2ACC(rb2, rb2, ra0, ra1); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_2ACC(rb3, rb3, ra0, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_4x2(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_2ACC(rb1, rb1, ra0, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_4x1(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + } + + for (; m < m4; m += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_BT_4x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + LOAD_A_1x4(k+2, m); + KERNEL_MMA_1ACC(rb2, ra0); + LOAD_A_1x4(k+3, m); + KERNEL_MMA_1ACC(rb3, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BT_4x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BT_4x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n, m); + } + + for (; m < m2; m += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x2(k, m); + LOAD_BT_4x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + LOAD_A_1x2(k+2, m); + KERNEL_MMA_1ACC(rb2, ra0); + LOAD_A_1x2(k+3, m); + KERNEL_MMA_1ACC(rb3, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BT_4x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BT_4x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; m < M; m++) { + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + 
+ vector float result = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(k, m); + LOAD_BT_4x4(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_1VSR(ra0, rb1); + LOAD_A_1x1(k+2, m); + KERNEL_VMADD_1VSR(ra0, rb2); + LOAD_A_1x1(k+3, m); + KERNEL_VMADD_1VSR(ra0, rb3); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BT_4x2(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_1VSR(ra0, rb1); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BT_4x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n+0, m); + } + } + + for (; n < n2; n += 2) { + for (m = 0; m < m16; m += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x16(k, m); + LOAD_BT_2x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+2, m); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+3, m); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra0, ra1, ra2, ra3); + } + for (; k < k2; k += 2) { + LOAD_A_1x16(k, m); + LOAD_BT_2x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra0, ra1, ra2, ra3); + } + for (; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_BT_2x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + SAVE_2x4_ACC(&acc2, n, m+8); + SAVE_2x4_ACC(&acc3, n, m+12); + } + + for (; m < m8; m += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_2x4(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_2ACC(rb1, rb1, ra0, ra1); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_2ACC(rb2, rb2, ra0, ra1); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_2ACC(rb3, rb3, ra0, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_2x2(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_2ACC(rb1, rb1, ra0, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_2x1(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + } + + for (; m < m4; m += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_BT_2x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + LOAD_A_1x4(k+2, m); + KERNEL_MMA_1ACC(rb2, ra0); + LOAD_A_1x4(k+3, m); + KERNEL_MMA_1ACC(rb3, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BT_2x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BT_2x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m); + } + + for (; m < m2; m 
+= 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x2(k, m); + LOAD_B_2x2(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_2x2_VSR(result, n, m); + } + + for (; m < M; m++) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_2x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_2x1_VSR(result, n, m); + } + } + + for (; n < N; n++) { + for (m = 0; m < m16; m += 16) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + SAVE_1x4_VSR(result2, n, m+8); + SAVE_1x4_VSR(result3, n, m+12); + } + + for (; m < m8; m += 8) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0, ra1; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + } + + for (; m < m4; m += 4) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + } + + for (; m < m2; m += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x2_VSR(result, n, m); + } + + for (; m < M; m++) { + FLOAT result = 0.0f; + + for (k = 0; k < K; k++) { + result += A[m+k*lda] * B[n*ldb+k]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free (packB); + + return 0; +} diff --git a/kernel/power/sgemm_small_kernel_nt_power10.c b/kernel/power/sgemm_small_kernel_nt_power10.c new file mode 100644 index 000000000..20d3c6b0e --- /dev/null +++ b/kernel/power/sgemm_small_kernel_nt_power10.c @@ -0,0 +1,887 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <altivec.h> + +typedef __vector unsigned char vec_t; + +#if !defined(B0) +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \ + rc0 = 
vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \ + C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \ + C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; \ + C[(N+2)*ldc+M] = result[2]; \ + C[(N+3)*ldc+M] = result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_A_1x16(K, M) \ + ra0 = vec_xl(0, A+(K*lda)+M+0); \ + ra1 = 
vec_xl(0, A+(K*lda)+M+4); \ + ra2 = vec_xl(0, A+(K*lda)+M+8); \ + ra3 = vec_xl(0, A+(K*lda)+M+12); + +#define LOAD_A_1x8(K, M) \ + ra0 = vec_xl(0, A+(K*lda)+M+0); \ + ra1 = vec_xl(0, A+(K*lda)+M+4); + +#define LOAD_A_1x4(K, M) ra0 = vec_xl(0, A+(K*lda)+M); + +#define LOAD_A_2x2(K, M) \ + ra0 = vec_splats(A[K*lda+M+0]); \ + ra0 = vec_insert(A[K*lda+M+1], ra0, 1); \ + ra0 = vec_insert(A[K*lda+M+1], ra0, 3); + +#define LOAD_A_1x2(K, M) ra0 = vec_xl_len(A+(K*lda)+M, 8); + +#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M+0]); + +#define LOAD_B_1x16(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+4); \ + rb2 = vec_xl(0, B+(K*ldb)+N+8); \ + rb3 = vec_xl(0, B+(K*ldb)+N+12); + +#define LOAD_B_1x8(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+4); + +#define LOAD_B_1x4(K, N) rb0 = vec_xl(0, B+(K*ldb)+N); + +#define LOAD_B_2x2(K, N) \ + rb0 = vec_splats(B[K*ldb+N]); \ + rb0 = vec_insert(B[K*ldb+N+1], rb0, 2); \ + rb0 = vec_insert(B[K*ldb+N+1], rb0, 3); + +#define LOAD_B_1x2(K, N) rb0 = vec_xl_len(B+(K*ldb)+N, 8); + +#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ + __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ + __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ + __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ + __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_A(ra0, ra1, ra2, ra3, offset) \ + vec_xst(ra0, 0, packA+(k*16)+0+offset); \ + vec_xst(ra1, 0, packA+(k*16)+4+offset); \ + vec_xst(ra2, 0, packA+(k*16)+8+offset); \ + vec_xst(ra3, 0, packA+(k*16)+12+offset); + +#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ + ra0 = vec_xl(0, packA+(k*16)+0+offset); \ + ra1 = vec_xl(0, packA+(k*16)+4+offset); \ + ra2 = vec_xl(0, packA+(k*16)+8+offset); \ + ra3 = vec_xl(0, packA+(k*16)+12+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 
= M & ~1; + + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + vector float valpha = vec_splats(alpha); +#if !defined(B0) + vector float vbeta = vec_splats(beta); +#endif + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 40 && N >= 40 && K >= 40) ? 1 : 0; +#else + int has_packing = 0; +#endif + + float *packA; + if (has_packing) packA = (float *)malloc(K*16*sizeof(float)); + + for (m = 0; m < m16; m += 16) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1; + + if (has_packing) { + if (n == 0) { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_A(ra0, ra1, ra2, ra3, 0); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + } else { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc4, n+0, m+8); + SAVE_4x4_ACC(&acc6, n+0, m+12); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + SAVE_4x4_ACC(&acc5, n+4, m+8); + SAVE_4x4_ACC(&acc7, n+4, m+12); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + + if (!has_packing) { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + + if (!has_packing) { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + SAVE_2x4_ACC(&acc2, n, m+8); + SAVE_2x4_ACC(&acc3, n, m+12); + } + + for (; n < N; n++) { + vector float result = ((vector float){0., 0., 0., 0.}); + vector float result1 = ((vector float){0., 0., 0., 0.}); + vector float result2 = ((vector float){0., 0., 0., 0.}); + vector float result3 = ((vector float){0., 0., 0., 0.}); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + + if (!has_packing) { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, 
rb0, rb0); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + SAVE_1x4_VSR(result2, n, m+8); + SAVE_1x4_VSR(result3, n, m+12); + } + } + + for (; m < m8; m += 8) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x16(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc4, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc5, n+4, m+4); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc6, n+8, m+4); + SAVE_4x4_ACC(&acc3, n+12, m+0); + SAVE_4x4_ACC(&acc7, n+12, m+4); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x4(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x2(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0, ra1; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0; + 
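/* 4 (rows of A) x 8 (columns of B) tile: each k iteration of the loop below
   feeds one rank-1 xvf32gerpp update into acc0 (columns n..n+3) and acc1
   (columns n+4..n+7), both covering rows m..m+3. */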
register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x2(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc2, n+8, m+0); + SAVE_4x2_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x2(k, m); + LOAD_B_2x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_2x2_VSR(result, n, m); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x2_VSR(result, n, m); + } + } + + for (; m < M; m++) { + for (n = 0; n < n16; n += 16) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + 
vector float result3 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x16(k, n); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + SAVE_4x1_VSR(result2, n+8, m); + SAVE_4x1_VSR(result3, n+12, m); + } + + for (; n < n8; n += 8) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x8(k, n); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + } + + for (; n < n4; n += 4) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x4(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n+0, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_2x1_VSR(result, n+0, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0f; + + for (k = 0; k < K; k++) { + result += A[k*lda+m] * B[k*ldb+n]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free (packA); + + return 0; +} diff --git a/kernel/power/sgemm_small_kernel_tn_power10.c b/kernel/power/sgemm_small_kernel_tn_power10.c new file mode 100644 index 000000000..64ecddbba --- /dev/null +++ b/kernel/power/sgemm_small_kernel_tn_power10.c @@ -0,0 +1,1678 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !defined(B0) +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \ + rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \ + C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \ + C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + 
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; \ + C[(N+2)*ldc+M] = result[2]; \ + C[(N+3)*ldc+M] = result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_AT_16x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); \ + ra4 = vec_xl(0, A+(M+4)*lda+K); \ + ra5 = vec_xl(0, A+(M+5)*lda+K); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra6 = vec_xl(0, A+(M+6)*lda+K); \ + ra7 = vec_xl(0, A+(M+7)*lda+K); \ + t2 = vec_mergeh(ra6, ra7); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = vec_xxpermdi(t0, t2, 0b00); \ + ra5 = vec_xxpermdi(t0, t2, 0b11); \ + ra6 = vec_xxpermdi(t1, t3, 0b00); \ + ra7 = vec_xxpermdi(t1, t3, 0b11); \ + ra8 = vec_xl(0, A+(M+8)*lda+K); \ + ra9 = vec_xl(0, A+(M+9)*lda+K); \ + t0 = vec_mergeh(ra8, ra9); \ + t1 = vec_mergel(ra8, ra9); \ + ra10 = vec_xl(0, A+(M+10)*lda+K); \ + ra11 = vec_xl(0, A+(M+11)*lda+K); \ + t2 = vec_mergeh(ra10, ra11); \ + t3 = vec_mergel(ra10, ra11); \ + ra8 = vec_xxpermdi(t0, t2, 0b00); \ + ra9 = vec_xxpermdi(t0, t2, 0b11); \ + 
ra10 = vec_xxpermdi(t1, t3, 0b00); \ + ra11 = vec_xxpermdi(t1, t3, 0b11); \ + ra12 = vec_xl(0, A+(M+12)*lda+K); \ + ra13 = vec_xl(0, A+(M+13)*lda+K); \ + t0 = vec_mergeh(ra12, ra13); \ + t1 = vec_mergel(ra12, ra13); \ + ra14 = vec_xl(0, A+(M+14)*lda+K); \ + ra15 = vec_xl(0, A+(M+15)*lda+K); \ + t2 = vec_mergeh(ra14, ra15); \ + t3 = vec_mergel(ra14, ra15); \ + ra12 = vec_xxpermdi(t0, t2, 0b00); \ + ra13 = vec_xxpermdi(t0, t2, 0b11); \ + ra14 = vec_xxpermdi(t1, t3, 0b00); \ + ra15 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_16x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); \ + ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \ + ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \ + t0 = vec_mergeh(ra4, ra5); \ + ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \ + ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \ + t1 = vec_mergeh(ra6, ra7); \ + ra2 = vec_xxpermdi(t0, t1, 0b00); \ + ra3 = vec_xxpermdi(t0, t1, 0b11); \ + ra8 = vec_xl_len(A+(M+8)*lda+K, 8); \ + ra9 = vec_xl_len(A+(M+9)*lda+K, 8); \ + t0 = vec_mergeh(ra8, ra9); \ + ra10 = vec_xl_len(A+(M+10)*lda+K, 8); \ + ra11 = vec_xl_len(A+(M+11)*lda+K, 8); \ + t1 = vec_mergeh(ra10, ra11); \ + ra4 = vec_xxpermdi(t0, t1, 0b00); \ + ra5 = vec_xxpermdi(t0, t1, 0b11); \ + ra12 = vec_xl_len(A+(M+12)*lda+K, 8); \ + ra13 = vec_xl_len(A+(M+13)*lda+K, 8); \ + t0 = vec_mergeh(ra12, ra13); \ + ra14 = vec_xl_len(A+(M+14)*lda+K, 8); \ + ra15 = vec_xl_len(A+(M+15)*lda+K, 8); \ + t1 = vec_mergeh(ra14, ra15); \ + ra6 = vec_xxpermdi(t0, t1, 0b00); \ + ra7 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_16x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \ + ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \ + ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3); \ + ra2 = vec_xor(ra2, ra2); \ + ra2 = vec_insert(A[(M+8)*lda+K], ra2, 0); \ + ra2 = vec_insert(A[(M+9)*lda+K], ra2, 1); \ + ra2 = vec_insert(A[(M+10)*lda+K], ra2, 2); \ + ra2 = vec_insert(A[(M+11)*lda+K], ra2, 3); \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(A[(M+12)*lda+K], ra3, 0); \ + ra3 = vec_insert(A[(M+13)*lda+K], ra3, 1); \ + ra3 = vec_insert(A[(M+14)*lda+K], ra3, 2); \ + ra3 = vec_insert(A[(M+15)*lda+K], ra3, 3); + +#define LOAD_AT_8x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); \ + ra4 = vec_xl(0, A+(M+4)*lda+K); \ + ra5 = vec_xl(0, A+(M+5)*lda+K); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra6 = vec_xl(0, A+(M+6)*lda+K); \ + ra7 = vec_xl(0, A+(M+7)*lda+K); \ + t2 = vec_mergeh(ra6, ra7); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = vec_xxpermdi(t0, t2, 0b00); \ + ra5 = vec_xxpermdi(t0, t2, 0b11); \ + ra6 = vec_xxpermdi(t1, t3, 0b00); \ + ra7 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_8x2(M, K) \ + ra0 = 
vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); \ + ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \ + ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \ + t0 = vec_mergeh(ra4, ra5); \ + ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \ + ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \ + t1 = vec_mergeh(ra6, ra7); \ + ra2 = vec_xxpermdi(t0, t1, 0b00); \ + ra3 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_8x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \ + ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \ + ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3); + +#define LOAD_AT_4x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_4x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_4x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); + +#define LOAD_AT_2x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeo(ra0, ra1); \ + t2 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; \ + ra2 = t2; \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(vec_extract(t2, 2), ra3, 0); \ + ra3 = vec_insert(vec_extract(t2, 3), ra3, 1); + +#define LOAD_AT_2x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergee(ra0, ra1); \ + t1 = vec_mergeo(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; + +#define LOAD_AT_2x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_2x2(M, K) \ + ra0 = vec_splats(A[(M+0)*lda+K]); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 3); + +#define LOAD_A_2x1(M, K) \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]); + +#define LOAD_BT_16x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 
0b11); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergel(rb4, rb5); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K); \ + t2 = vec_mergeh(rb6, rb7); \ + t3 = vec_mergel(rb6, rb7); \ + rb4 = vec_xxpermdi(t0, t2, 0b00); \ + rb5 = vec_xxpermdi(t0, t2, 0b11); \ + rb6 = vec_xxpermdi(t1, t3, 0b00); \ + rb7 = vec_xxpermdi(t1, t3, 0b11); \ + rb8 = vec_xl(0, B+(N+8)*ldb+K); \ + rb9 = vec_xl(0, B+(N+9)*ldb+K); \ + t0 = vec_mergeh(rb8, rb9); \ + t1 = vec_mergel(rb8, rb9); \ + rb10 = vec_xl(0, B+(N+10)*ldb+K); \ + rb11 = vec_xl(0, B+(N+11)*ldb+K); \ + t2 = vec_mergeh(rb10, rb11); \ + t3 = vec_mergel(rb10, rb11); \ + rb8 = vec_xxpermdi(t0, t2, 0b00); \ + rb9 = vec_xxpermdi(t0, t2, 0b11); \ + rb10 = vec_xxpermdi(t1, t3, 0b00); \ + rb11 = vec_xxpermdi(t1, t3, 0b11); \ + rb12 = vec_xl(0, B+(N+12)*ldb+K); \ + rb13 = vec_xl(0, B+(N+13)*ldb+K); \ + t0 = vec_mergeh(rb12, rb13); \ + t1 = vec_mergel(rb12, rb13); \ + rb14 = vec_xl(0, B+(N+14)*ldb+K); \ + rb15 = vec_xl(0, B+(N+15)*ldb+K); \ + t2 = vec_mergeh(rb14, rb15); \ + t3 = vec_mergel(rb14, rb15); \ + rb12 = vec_xxpermdi(t0, t2, 0b00); \ + rb13 = vec_xxpermdi(t0, t2, 0b11); \ + rb14 = vec_xxpermdi(t1, t3, 0b00); \ + rb15 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_16x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); \ + rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \ + rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \ + rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \ + rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergeh(rb6, rb7); \ + rb2 = vec_xxpermdi(t0, t1, 0b00); \ + rb3 = vec_xxpermdi(t0, t1, 0b11); \ + rb8 = vec_xl_len(B+(N+8)*ldb+K, 8); \ + rb9 = vec_xl_len(B+(N+9)*ldb+K, 8); \ + rb10 = vec_xl_len(B+(N+10)*ldb+K, 8); \ + rb11 = vec_xl_len(B+(N+11)*ldb+K, 8); \ + t0 = vec_mergeh(rb8, rb9); \ + t1 = vec_mergeh(rb10, rb11); \ + rb4 = vec_xxpermdi(t0, t1, 0b00); \ + rb5 = vec_xxpermdi(t0, t1, 0b11); \ + rb12 = vec_xl_len(B+(N+12)*ldb+K, 8); \ + rb13 = vec_xl_len(B+(N+13)*ldb+K, 8); \ + rb14 = vec_xl_len(B+(N+14)*ldb+K, 8); \ + rb15 = vec_xl_len(B+(N+15)*ldb+K, 8); \ + t0 = vec_mergeh(rb12, rb13); \ + t1 = vec_mergeh(rb14, rb15); \ + rb6 = vec_xxpermdi(t0, t1, 0b00); \ + rb7 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_16x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3); \ + rb2 = vec_xor(rb2, rb2); \ + rb2 = vec_insert(B[(N+8)*ldb+K], rb2, 0); \ + rb2 = vec_insert(B[(N+9)*ldb+K], rb2, 1); \ + rb2 = vec_insert(B[(N+10)*ldb+K], rb2, 2); \ + rb2 = vec_insert(B[(N+11)*ldb+K], rb2, 3); \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(B[(N+12)*ldb+K], rb3, 0); \ + rb3 = vec_insert(B[(N+13)*ldb+K], rb3, 1); \ + rb3 = vec_insert(B[(N+14)*ldb+K], rb3, 2); \ + rb3 = vec_insert(B[(N+15)*ldb+K], rb3, 3); + +#define LOAD_BT_8x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, 
rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergel(rb4, rb5); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K); \ + t2 = vec_mergeh(rb6, rb7); \ + t3 = vec_mergel(rb6, rb7); \ + rb4 = vec_xxpermdi(t0, t2, 0b00); \ + rb5 = vec_xxpermdi(t0, t2, 0b11); \ + rb6 = vec_xxpermdi(t1, t3, 0b00); \ + rb7 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_8x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); \ + rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \ + rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \ + t0 = vec_mergeh(rb4, rb5); \ + rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \ + rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \ + t1 = vec_mergeh(rb6, rb7); \ + rb2 = vec_xxpermdi(t0, t1, 0b00); \ + rb3 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_8x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3); + +#define LOAD_BT_4x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_4x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_4x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); + +#define LOAD_BT_2x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeo(rb0, rb1); \ + t2 = vec_mergel(rb0, rb1); \ + rb0 = t0; \ + rb1 = t1; \ + rb2 = t2; \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(vec_extract(t2,2), rb3, 0); \ + rb3 = vec_insert(vec_extract(t2,3), rb3, 1); + +#define LOAD_BT_2x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergee(rb0, rb1); \ + t1 = vec_mergeo(rb0, rb1); \ + rb0 = t0; \ + rb1 = t1; + +#define LOAD_BT_2x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); + 
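The LOAD_AT_*x4 / LOAD_BT_*x4 macros above read the transposed operands along K and transpose them in-register with vec_mergeh/vec_mergel followed by vec_xxpermdi, so each resulting vector holds one k-column of four consecutive rows. A minimal sketch of that 4x4 transpose step, assuming <altivec.h> and VSX as used elsewhere in this file (the helper name transpose_4x4f is illustrative only, not part of the kernel):

/* Transpose four row vectors r0..r3 into four column vectors c[0..3];
   mirrors the merge/permute sequence of LOAD_AT_4x4 / LOAD_BT_4x4. */
static inline void transpose_4x4f(vector float r0, vector float r1,
                                  vector float r2, vector float r3,
                                  vector float c[4])
{
        vector float t0 = vec_mergeh(r0, r1);   /* r0[0] r1[0] r0[1] r1[1] */
        vector float t1 = vec_mergel(r0, r1);   /* r0[2] r1[2] r0[3] r1[3] */
        vector float t2 = vec_mergeh(r2, r3);   /* r2[0] r3[0] r2[1] r3[1] */
        vector float t3 = vec_mergel(r2, r3);   /* r2[2] r3[2] r2[3] r3[3] */
        c[0] = vec_xxpermdi(t0, t2, 0b00);      /* column 0: r0[0] r1[0] r2[0] r3[0] */
        c[1] = vec_xxpermdi(t0, t2, 0b11);      /* column 1 */
        c[2] = vec_xxpermdi(t1, t3, 0b00);      /* column 2 */
        c[3] = vec_xxpermdi(t1, t3, 0b11);      /* column 3 */
}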
+#define LOAD_B_2x2(N, K) \ + rb0 = vec_splats(B[(N+0)*ldb+K]); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 3); + +#define LOAD_B_2x1(N, K) \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); + +#define LOAD_B_1x1(N, K) rb0 = vec_splats(B[(N)*ldb+K]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ + __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ + __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ + __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ + __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_A(ra0, ra1, ra2, ra3, offset) \ + vec_xst(ra0, 0, packA+(k*16)+0+offset); \ + vec_xst(ra1, 0, packA+(k*16)+4+offset); \ + vec_xst(ra2, 0, packA+(k*16)+8+offset); \ + vec_xst(ra3, 0, packA+(k*16)+12+offset); + +#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ + ra0 = vec_xl(0, packA+(k*16)+0+offset); \ + ra1 = vec_xl(0, packA+(k*16)+4+offset); \ + ra2 = vec_xl(0, packA+(k*16)+8+offset); \ + ra3 = vec_xl(0, packA+(k*16)+12+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k4 = K & ~3; + BLASLONG k2 = K & ~1; + + vector float valpha = vec_splats(alpha); +#if !defined(B0) + vector float vbeta = vec_splats(beta); +#endif + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 
1 : 0; +#else + int has_packing = 0; +#endif + + float *packA; + if (has_packing) packA = (float *)malloc(K*16*sizeof(float)); + + for (m = 0; m < m16; m += 16) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + if (has_packing) { + if (n == 0) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb0, rb0, rb4, rb4, + ra0, ra4, ra0, ra4, ra8, ra12, ra8, ra12); + PACK_A(ra0, ra4, ra8, ra12, 0); + KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb1, rb1, rb5, rb5, + ra1, ra5, ra1, ra5, ra9, ra13, ra9, ra13); + PACK_A(ra1, ra5, ra9, ra13, 16); + KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb2, rb2, rb6, rb6, + ra2, ra6, ra2, ra6, ra10, ra14, ra10, ra14); + PACK_A(ra2, ra6, ra10, ra14, 32); + KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb3, rb3, rb7, rb7, + ra3, ra7, ra3, ra7, ra11, ra15, ra11, ra15); + PACK_A(ra3, ra7, ra11, ra15, 48); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb0, rb0, rb2, rb2, + ra0, ra2, ra0, ra2, ra4, ra6, ra4, ra6); + PACK_A(ra0, ra2, ra4, ra6, 0); + KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb1, rb1, rb3, rb3, + ra1, ra3, ra1, ra3, ra5, ra7, ra5, ra7); + PACK_A(ra1, ra3, ra5, ra7, 16); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb0, rb0, rb1, rb1, + ra0, ra1, ra0, ra1, ra2, ra3, ra2, ra3); + PACK_A(ra0, ra1, ra2, ra3, 0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_BT_8x4(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb0, rb0, rb4, rb4, + ra0, ra4, ra0, ra4, ra8, ra12, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb1, rb1, rb5, rb5, + ra1, ra5, ra1, ra5, ra9, ra13, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb2, rb2, rb6, rb6, + ra2, ra6, ra2, ra6, ra10, ra14, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb3, rb3, rb7, rb7, + ra3, ra7, ra3, ra7, ra11, ra15, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BT_8x2(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb0, rb0, rb2, rb2, + ra0, ra2, ra0, ra2, ra4, ra6, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb1, rb1, rb3, rb3, + ra1, ra3, ra1, ra3, ra5, ra7, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BT_8x1(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb0, rb0, rb1, rb1, + ra0, ra1, ra0, ra1, ra2, ra3, ra2, ra3); + } + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb0, rb0, rb4, rb4, + ra0, ra4, ra0, ra4, ra8, ra12, ra8, ra12); + KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb1, rb1, rb5, rb5, + ra1, ra5, ra1, ra5, ra9, ra13, ra9, ra13); + KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb2, rb2, rb6, rb6, + ra2, ra6, ra2, ra6, ra10, ra14, ra10, ra14); + KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb3, rb3, rb7, rb7, + ra3, ra7, ra3, ra7, ra11, ra15, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb0, rb0, rb2, rb2, + ra0, ra2, ra0, ra2, 
ra4, ra6, ra4, ra6); + KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb1, rb1, rb3, rb3, + ra1, ra3, ra1, ra3, ra5, ra7, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb0, rb0, rb1, rb1, + ra0, ra1, ra0, ra1, ra2, ra3, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc4, n+0, m+8); + SAVE_4x4_ACC(&acc5, n+0, m+12); + SAVE_4x4_ACC(&acc2, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + SAVE_4x4_ACC(&acc6, n+4, m+8); + SAVE_4x4_ACC(&acc7, n+4, m+12); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_BT_4x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_BT_4x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_BT_4x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BT_4x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BT_4x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_BT_2x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_BT_2x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + 
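/* K tail (K odd): gather a single k element from each of the 16 rows of A
   and the 2 rows of B, then perform one xvf32gerpp update per accumulator. */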
LOAD_AT_16x1(m, k); + LOAD_BT_2x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_BT_2x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BT_2x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BT_2x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n+0, m+0); + SAVE_2x4_ACC(&acc1, n+0, m+4); + SAVE_2x4_ACC(&acc2, n+0, m+8); + SAVE_2x4_ACC(&acc3, n+0, m+12); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0); + LOAD_B_1x1(n, k+2); + KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0); + LOAD_B_1x1(n, k+3); + KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x1(n, k+2); + KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x1(n, k+3); + KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+4); + SAVE_1x4_VSR(result2, n, m+8); + 
SAVE_1x4_VSR(result3, n, m+12); + } + } + + for (; m < m8; m += 8) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_BT_16x4(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb8, rb8, rb12, rb12, + ra0, ra4, ra0, ra4, ra0, ra4, ra0, ra4); + KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb9, rb9, rb13, rb13, + ra1, ra5, ra1, ra5, ra1, ra5, ra1, ra5); + KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb10, rb10, rb14, rb14, + ra2, ra6, ra2, ra6, ra2, ra6, ra2, ra6); + KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb11, rb11, rb15, rb15, + ra3, ra7, ra3, ra7, ra3, ra7, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BT_16x2(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb4, rb4, rb6, rb6, + ra0, ra2, ra0, ra2, ra0, ra2, ra0, ra2); + KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb5, rb5, rb7, rb7, + ra1, ra3, ra1, ra3, ra1, ra3, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BT_16x1(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb2, rb2, rb3, rb3, + ra0, ra1, ra0, ra1, ra0, ra1, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + SAVE_4x4_ACC(&acc4, n+8, m+0); + SAVE_4x4_ACC(&acc5, n+8, m+4); + SAVE_4x4_ACC(&acc6, n+12, m+0); + SAVE_4x4_ACC(&acc7, n+12, m+4); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb4, rb4, ra0, ra4, ra0, ra4); + KERNEL_MMA_4ACC(rb1, rb1, rb5, rb5, ra1, ra5, ra1, ra5); + KERNEL_MMA_4ACC(rb2, rb2, rb6, rb6, ra2, ra6, ra2, ra6); + KERNEL_MMA_4ACC(rb3, rb3, rb7, rb7, ra3, ra7, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb2, rb2, ra0, ra2, ra0, ra2); + KERNEL_MMA_4ACC(rb1, rb1, rb3, rb3, ra1, ra3, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb1, rb1, ra0, ra1, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_BT_4x4(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4); + KERNEL_MMA_2ACC(rb1, rb1, ra1, ra5); + KERNEL_MMA_2ACC(rb2, rb2, ra2, ra6); + KERNEL_MMA_2ACC(rb3, rb3, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BT_4x2(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2); + KERNEL_MMA_2ACC(rb1, rb1, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + 
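/* Needed only on the beta != 0 path: the SAVE_* macros load the existing C
   values into rc0 and compute C = beta*C + alpha*acc, tile by tile. */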
register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_BT_2x4(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4); + KERNEL_MMA_2ACC(rb1, rb1, ra1, ra5); + KERNEL_MMA_2ACC(rb2, rb2, ra2, ra6); + KERNEL_MMA_2ACC(rb3, rb3, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BT_2x2(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2); + KERNEL_MMA_2ACC(rb1, rb1, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BT_2x1(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra4, rb0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_2VSR(ra1, ra5, rb0, rb0); + LOAD_B_1x1(n, k+2); + KERNEL_VMADD_2VSR(ra2, ra6, rb0, rb0); + LOAD_B_1x1(n, k+3); + KERNEL_VMADD_2VSR(ra3, ra7, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra2, rb0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_2VSR(ra1, ra3, rb0, rb0); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_BT_16x4(n, k); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra1, ra1, ra1, ra1); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra2, ra2, ra2, ra2); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra3, ra3, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BT_16x2(n, k); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra1, ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BT_16x1(n, k); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0); + KERNEL_MMA_2ACC(rb1, rb5, ra1, ra1); 
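/* second half of the 4-way unrolled k step: k-columns k+2 and k+3 */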
+ KERNEL_MMA_2ACC(rb2, rb6, ra2, ra2); + KERNEL_MMA_2ACC(rb3, rb7, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0); + KERNEL_MMA_2ACC(rb1, rb3, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_BT_4x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + KERNEL_MMA_1ACC(rb2, ra2); + KERNEL_MMA_1ACC(rb3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BT_4x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_BT_2x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + KERNEL_MMA_1ACC(rb2, ra2); + KERNEL_MMA_1ACC(rb3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BT_2x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BT_2x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_1VSR(ra1, rb0); + LOAD_B_1x1(n, k+2); + KERNEL_VMADD_1VSR(ra2, rb0); + LOAD_B_1x1(n, k+3); + KERNEL_VMADD_1VSR(ra3, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_1VSR(ra1, rb0); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0); + KERNEL_MMA_2ACC(rb1, rb5, ra1, ra1); + KERNEL_MMA_2ACC(rb2, rb6, ra2, ra2); + KERNEL_MMA_2ACC(rb3, rb7, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0); + KERNEL_MMA_2ACC(rb1, rb3, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if 
!defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_BT_4x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + KERNEL_MMA_1ACC(rb2, ra2); + KERNEL_MMA_1ACC(rb3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BT_4x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x2(m, k); + LOAD_B_2x2(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_2x2_VSR(result, n, m); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0 = ((vector float){0.,0.,0.,0.}); + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x2_VSR(result, n, m); + } + } + + for (; m < M; m++) { + for (n = 0; n < n8; n += 8) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(m, k); + LOAD_BT_8x4(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb4); + LOAD_A_1x1(m, k+1); + KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb5); + LOAD_A_1x1(m, k+2); + KERNEL_VMADD_2VSR(ra0, ra0, rb2, rb6); + LOAD_A_1x1(m, k+3); + KERNEL_VMADD_2VSR(ra0, ra0, rb3, rb7); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(m, k); + LOAD_BT_8x2(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb2); + LOAD_A_1x1(m, k+1); + KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb3); + } + for (; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+4, m); + } + + for (; n < n4; n += 4) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(m, k); + LOAD_BT_4x4(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_A_1x1(m, k+1); + KERNEL_VMADD_1VSR(ra0, rb1); + LOAD_A_1x1(m, k+2); + KERNEL_VMADD_1VSR(ra0, rb2); + LOAD_A_1x1(m, k+3); + KERNEL_VMADD_1VSR(ra0, rb3); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(m, k); + LOAD_BT_4x2(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_A_1x1(m, k+1); + KERNEL_VMADD_1VSR(ra0, rb1); + } + for (; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + 
LOAD_B_2x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_2x1_VSR(result, n, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0f; + + for (k = 0; k < K; k++) { + result += A[m*lda+k] * B[n*ldb+k]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free (packA); + + return 0; +} diff --git a/kernel/power/sgemm_small_kernel_tt_power10.c b/kernel/power/sgemm_small_kernel_tt_power10.c new file mode 100644 index 000000000..71bc7b937 --- /dev/null +++ b/kernel/power/sgemm_small_kernel_tt_power10.c @@ -0,0 +1,1559 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !defined(B0) +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \ + rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \ + C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \ + C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + 
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; \ + C[(N+2)*ldc+M] = result[2]; \ + C[(N+3)*ldc+M] = result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_AT_16x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); \ + ra4 = vec_xl(0, A+(M+4)*lda+K); \ + ra5 = vec_xl(0, A+(M+5)*lda+K); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra6 = vec_xl(0, A+(M+6)*lda+K); \ + ra7 = vec_xl(0, A+(M+7)*lda+K); \ + t2 = vec_mergeh(ra6, ra7); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = vec_xxpermdi(t0, t2, 0b00); \ + ra5 = vec_xxpermdi(t0, t2, 0b11); \ + ra6 = vec_xxpermdi(t1, t3, 0b00); \ + ra7 = vec_xxpermdi(t1, t3, 0b11); \ + ra8 = vec_xl(0, A+(M+8)*lda+K); \ + ra9 = vec_xl(0, A+(M+9)*lda+K); \ + t0 = vec_mergeh(ra8, ra9); \ + t1 = vec_mergel(ra8, ra9); \ + ra10 = vec_xl(0, A+(M+10)*lda+K); \ + ra11 = vec_xl(0, A+(M+11)*lda+K); \ + t2 = vec_mergeh(ra10, ra11); \ + t3 = vec_mergel(ra10, ra11); \ + ra8 = vec_xxpermdi(t0, t2, 0b00); \ + ra9 = vec_xxpermdi(t0, t2, 0b11); \ + 
ra10 = vec_xxpermdi(t1, t3, 0b00); \ + ra11 = vec_xxpermdi(t1, t3, 0b11); \ + ra12 = vec_xl(0, A+(M+12)*lda+K); \ + ra13 = vec_xl(0, A+(M+13)*lda+K); \ + t0 = vec_mergeh(ra12, ra13); \ + t1 = vec_mergel(ra12, ra13); \ + ra14 = vec_xl(0, A+(M+14)*lda+K); \ + ra15 = vec_xl(0, A+(M+15)*lda+K); \ + t2 = vec_mergeh(ra14, ra15); \ + t3 = vec_mergel(ra14, ra15); \ + ra12 = vec_xxpermdi(t0, t2, 0b00); \ + ra13 = vec_xxpermdi(t0, t2, 0b11); \ + ra14 = vec_xxpermdi(t1, t3, 0b00); \ + ra15 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_16x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); \ + ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \ + ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \ + t0 = vec_mergeh(ra4, ra5); \ + ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \ + ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \ + t1 = vec_mergeh(ra6, ra7); \ + ra2 = vec_xxpermdi(t0, t1, 0b00); \ + ra3 = vec_xxpermdi(t0, t1, 0b11); \ + ra8 = vec_xl_len(A+(M+8)*lda+K, 8); \ + ra9 = vec_xl_len(A+(M+9)*lda+K, 8); \ + t0 = vec_mergeh(ra8, ra9); \ + ra10 = vec_xl_len(A+(M+10)*lda+K, 8); \ + ra11 = vec_xl_len(A+(M+11)*lda+K, 8); \ + t1 = vec_mergeh(ra10, ra11); \ + ra4 = vec_xxpermdi(t0, t1, 0b00); \ + ra5 = vec_xxpermdi(t0, t1, 0b11); \ + ra12 = vec_xl_len(A+(M+12)*lda+K, 8); \ + ra13 = vec_xl_len(A+(M+13)*lda+K, 8); \ + t0 = vec_mergeh(ra12, ra13); \ + ra14 = vec_xl_len(A+(M+14)*lda+K, 8); \ + ra15 = vec_xl_len(A+(M+15)*lda+K, 8); \ + t1 = vec_mergeh(ra14, ra15); \ + ra6 = vec_xxpermdi(t0, t1, 0b00); \ + ra7 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_16x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \ + ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \ + ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3); \ + ra2 = vec_xor(ra2, ra2); \ + ra2 = vec_insert(A[(M+8)*lda+K], ra2, 0); \ + ra2 = vec_insert(A[(M+9)*lda+K], ra2, 1); \ + ra2 = vec_insert(A[(M+10)*lda+K], ra2, 2); \ + ra2 = vec_insert(A[(M+11)*lda+K], ra2, 3); \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(A[(M+12)*lda+K], ra3, 0); \ + ra3 = vec_insert(A[(M+13)*lda+K], ra3, 1); \ + ra3 = vec_insert(A[(M+14)*lda+K], ra3, 2); \ + ra3 = vec_insert(A[(M+15)*lda+K], ra3, 3); + +#define LOAD_AT_8x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); \ + ra4 = vec_xl(0, A+(M+4)*lda+K); \ + ra5 = vec_xl(0, A+(M+5)*lda+K); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra6 = vec_xl(0, A+(M+6)*lda+K); \ + ra7 = vec_xl(0, A+(M+7)*lda+K); \ + t2 = vec_mergeh(ra6, ra7); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = vec_xxpermdi(t0, t2, 0b00); \ + ra5 = vec_xxpermdi(t0, t2, 0b11); \ + ra6 = vec_xxpermdi(t1, t3, 0b00); \ + ra7 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_8x2(M, K) \ + ra0 = 
vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); \ + ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \ + ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \ + t0 = vec_mergeh(ra4, ra5); \ + ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \ + ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \ + t1 = vec_mergeh(ra6, ra7); \ + ra2 = vec_xxpermdi(t0, t1, 0b00); \ + ra3 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_8x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \ + ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \ + ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3); + +#define LOAD_AT_4x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_4x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_4x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); + +#define LOAD_AT_2x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeo(ra0, ra1); \ + t2 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; \ + ra2 = t2; \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(vec_extract(t2,2), ra3, 0); \ + ra3 = vec_insert(vec_extract(t2,3), ra3, 1); + +#define LOAD_AT_2x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergee(ra0, ra1); \ + t1 = vec_mergeo(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; + +#define LOAD_AT_2x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_2x2(M, K) \ + ra0 = vec_splats(A[M*lda+K]); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 3); + +#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[M*lda+K]); + +#define LOAD_B_1x16(K, N) \ + rb0 = vec_xl(0, B+((K)*ldb)+N+0); \ + rb1 = vec_xl(0, B+((K)*ldb)+N+4); \ + rb2 = vec_xl(0, B+((K)*ldb)+N+8); \ + rb3 = vec_xl(0, B+((K)*ldb)+N+12); + +#define LOAD_B_1x8(K, N) \ + rb0 = vec_xl(0, B+((K)*ldb)+N+0); \ + rb1 = vec_xl(0, B+((K)*ldb)+N+4); + +#define LOAD_B_1x4(K, N) rb0 = vec_xl(0, B+((K)*ldb)+N); + +#define LOAD_B_1x2(K, N) rb0 = vec_xl_len(B+((K)*ldb)+N, 8); + +#define LOAD_B_2x2(K, N) \ + rb0 = vec_splats(B[K*ldb+N]); \ + rb0 = vec_insert(B[K*ldb+N+1], rb0, 2); \ + rb0 = vec_insert(B[K*ldb+N+1], rb0, 3); + +#define 
LOAD_B_1x1(K, N) rb0 = vec_splats(B[(K)*ldb+N]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ + __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ + __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ + __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ + __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_A(ra0, ra1, ra2, ra3, offset) \ + vec_xst(ra0, 0, packA+(k*16)+0+offset); \ + vec_xst(ra1, 0, packA+(k*16)+4+offset); \ + vec_xst(ra2, 0, packA+(k*16)+8+offset); \ + vec_xst(ra3, 0, packA+(k*16)+12+offset); + +#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ + ra0 = vec_xl(0, packA+(k*16)+0+offset); \ + ra1 = vec_xl(0, packA+(k*16)+4+offset); \ + ra2 = vec_xl(0, packA+(k*16)+8+offset); \ + ra3 = vec_xl(0, packA+(k*16)+12+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k4 = K & ~3; + BLASLONG k2 = K & ~1; + + vector float valpha = vec_splats(alpha); +#if !defined(B0) + vector float vbeta = vec_splats(beta); +#endif + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 
1 : 0; +#else + int has_packing = 0; +#endif + + float *packA; + if (has_packing) packA = (float *)malloc(K*16*sizeof(float)); + + for (m = 0; m < m16; m += 16) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0, rb1; + register vector float t0, t1, t2, t3; + + if (has_packing) { + if (n == 0) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra4, ra8, ra12, ra0, ra4, ra8, ra12); + PACK_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra5, ra9, ra13, ra1, ra5, ra9, ra13); + PACK_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra2, ra6, ra10, ra14, ra2, ra6, ra10, ra14); + PACK_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra3, ra7, ra11, ra15, ra3, ra7, ra11, ra15); + PACK_A(ra3, ra7, ra11, ra15, 48); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); + PACK_A(ra0, ra2, ra4, ra6, 0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); + PACK_A(ra1, ra3, ra5, ra7, 16); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra1, ra2, ra3, ra0, ra1, ra2, ra3); + PACK_A(ra0, ra1, ra2, ra3, 0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra4, ra8, ra12, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra5, ra9, ra13, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra2, ra6, ra10, ra14, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra3, ra7, ra11, ra15, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra1, ra2, ra3, ra0, ra1, ra2, ra3); + } + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra4, ra8, ra12, ra0, ra4, ra8, ra12); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra5, ra9, ra13, ra1, ra5, ra9, ra13); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra2, ra6, ra10, ra14, ra2, ra6, ra10, ra14); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, 
rb1, + ra3, ra7, ra11, ra15, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra1, ra2, ra3, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + SAVE_4x4_ACC(&acc4, n+4, m+0); + SAVE_4x4_ACC(&acc5, n+4, m+4); + SAVE_4x4_ACC(&acc6, n+4, m+8); + SAVE_4x4_ACC(&acc7, n+4, m+12); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra5, ra9, ra13); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra2, ra6, ra10, ra14); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_B_1x2(k+1, n); + 
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra5, ra9, ra13); + LOAD_B_1x2(k+2, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra2, ra6, ra10, ra14); + LOAD_B_1x2(k+3, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x2(k+2, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x2(k+3, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_B_1x2(k, n); + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_B_1x2(k+1, n); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + SAVE_2x4_ACC(&acc2, n, m+8); + SAVE_2x4_ACC(&acc3, n, m+12); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0); + LOAD_B_1x1(k+2, n); + KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0); + LOAD_B_1x1(k+3, n); + KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x1(k+2, n); + KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x1(k+3, n); + KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, 
ra6, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + SAVE_1x4_VSR(result2, n, m+8); + SAVE_1x4_VSR(result3, n, m+12); + } + } + + for (; m < m8; m += 8) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra4, ra4, ra4, ra4); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra1, ra1, ra1, ra1, ra5, ra5, ra5, ra5); + LOAD_B_1x16(k+2, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra2, ra2, ra2, ra2, ra6, ra6, ra6, ra6); + LOAD_B_1x16(k+3, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra3, ra3, ra3, ra3, ra7, ra7, ra7, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra2, ra2, ra2, ra2); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra1, ra1, ra1, ra1, ra3, ra3, ra3, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc4, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc5, n+4, m+4); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc6, n+8, m+4); + SAVE_4x4_ACC(&acc3, n+12, m+0); + SAVE_4x4_ACC(&acc7, n+12, m+4); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra4, ra4); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra1, ra1, ra5, ra5); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra2, ra2, ra6, ra6); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra3, ra3, ra7, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra2, ra2); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra1, ra1, ra3, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register 
vector float rb0; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_2ACC(rb0, rb0, ra1, ra5); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_2ACC(rb0, rb0, ra2, ra6); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_2ACC(rb0, rb0, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_2ACC(rb0, rb0, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_2ACC(rb0, rb0, ra1, ra5); + LOAD_B_1x2(k+2, n); + KERNEL_MMA_2ACC(rb0, rb0, ra2, ra6); + LOAD_B_1x2(k+3, n); + KERNEL_MMA_2ACC(rb0, rb0, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_2ACC(rb0, rb0, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra4, rb0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_2VSR(ra1, ra5, rb0, rb0); + LOAD_B_1x1(k+2, n); + KERNEL_VMADD_2VSR(ra2, ra6, rb0, rb0); + LOAD_B_1x1(k+3, n); + KERNEL_VMADD_2VSR(ra3, ra7, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra2, rb0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_2VSR(ra1, ra3, rb0, rb0); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra1, ra1, ra1, ra1); + LOAD_B_1x16(k+2, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra2, ra2, ra2, ra2); + LOAD_B_1x16(k+3, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra3, ra3, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra1, 
ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_2ACC(rb0, rb1, ra1, ra1); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_2ACC(rb0, rb1, ra2, ra2); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_2ACC(rb0, rb1, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_2ACC(rb0, rb1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_1ACC(rb0, ra2); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_1ACC(rb0, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + LOAD_B_1x2(k+2, n); + KERNEL_MMA_1ACC(rb0, ra2); + LOAD_B_1x2(k+3, n); + KERNEL_MMA_1ACC(rb0, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_1VSR(ra1, rb0); + LOAD_B_1x1(k+2, n); + KERNEL_VMADD_1VSR(ra2, rb0); + LOAD_B_1x1(k+3, n); + KERNEL_VMADD_1VSR(ra3, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_1VSR(ra1, rb0); + } + for (; k < K; k++) 
{ + LOAD_AT_4x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra1, ra1, ra1, ra1); + LOAD_B_1x16(k+2, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra2, ra2, ra2, ra2); + LOAD_B_1x16(k+3, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra3, ra3, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra1, ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc2, n+8, m+0); + SAVE_4x2_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_2ACC(rb0, rb1, ra1, ra1); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_2ACC(rb0, rb1, ra2, ra2); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_2ACC(rb0, rb1, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_2ACC(rb0, rb1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_1ACC(rb0, ra2); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_1ACC(rb0, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x2(m, k); + LOAD_B_2x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_2x2_VSR(result, n, m); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + register vector float ra0; + 
register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + ra0 = vec_insert(A[(m+1)*lda+k], ra0, 1); + ra0 = vec_insert(A[(m+1)*lda+k], ra0, 3); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x2_VSR(result, n, m); + } + } + + for (; m < M; m++) { + for (n = 0; n < n16; n += 16) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x16(k, n); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + SAVE_4x1_VSR(result2, n+8, m); + SAVE_4x1_VSR(result3, n+12, m); + } + + for (; n < n8; n += 8) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + } + + for (; n < n4; n += 4) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_2x1_VSR(result, n, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0f; + + for (k = 0; k < K; k++) { + result += A[m*lda+k] * B[k*ldb+n]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free (packA); + + return 0; +} diff --git a/kernel/power/srot.c b/kernel/power/srot.c index a53342f61..3e4f93e2a 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
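The small-kernel files above all follow the same MMA pattern: each __vector_quad holds a 4x4 block of C, __builtin_mma_xvf32gerpp adds one rank-1 (outer-product) update per k step, and __builtin_mma_disassemble_acc spills the four accumulator rows for the alpha/beta store macros. A minimal, self-contained sketch of that pattern, assuming GCC 10+ with MMA support (-mcpu=power10); the function name is illustrative:

#include <altivec.h>

typedef __vector unsigned char vec_t;

/* rows[i][j] += b[i] * a[j], matching the (vec_t)rbX / (vec_t)raX operand
   order used by the KERNEL_MMA_* macros above. c receives a 4x4 row block. */
static void mma_4x4_outer_product(const float *a, const float *b, float *c)
{
  __vector_quad acc;
  vector float rows[4];

  __builtin_mma_xxsetaccz(&acc);                          /* zero the accumulator */
  vector float va = vec_xl(0, (float *)a);                /* a[0..3]              */
  vector float vb = vec_xl(0, (float *)b);                /* b[0..3]              */
  __builtin_mma_xvf32gerpp(&acc, (vec_t)vb, (vec_t)va);   /* rank-1 update        */
  __builtin_mma_disassemble_acc((void *)rows, &acc);      /* spill the 4 rows     */

  for (int i = 0; i < 4; i++)
    vec_xst(rows[i], 0, c + i*4);
}

Every blocked path in the TT kernel reduces to the same scalar reference as its own tail loop; a sketch for cross-checking, with the indexing taken directly from that loop:

/* A addressed as A[m*lda+k], B as B[k*ldb+n], C as C[n*ldc+m]. */
static void sgemm_small_tt_ref(long M, long N, long K,
                               const float *A, long lda, float alpha,
                               const float *B, long ldb, float beta,
                               float *C, long ldc)
{
  for (long n = 0; n < N; n++)
    for (long m = 0; m < M; m++) {
      float acc = 0.0f;
      for (long k = 0; k < K; k++)
        acc += A[m*lda + k] * B[k*ldb + n];
      C[n*ldc + m] = alpha*acc + beta*C[n*ldc + m];  /* B0 builds omit the beta term */
    }
}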
#pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "srot_microk_power8.c" +#elif defined(POWER10) +#include "srot_microk_power10.c" #endif #endif @@ -115,6 +117,23 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 > 0 ) + { + srot_kernel_16(n1, &x1[i], &y1[i], c, s); + i+=n1; + } +#else BLASLONG n1 = n & -16; if ( n1 > 0 ) { @@ -122,6 +141,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT i=n1; } +#endif while(i < n) { temp = c*x[i] + s*y[i] ; diff --git a/kernel/power/srot_microk_power10.c b/kernel/power/srot_microk_power10.c new file mode 100644 index 000000000..c54c30742 --- /dev/null +++ b/kernel/power/srot_microk_power10.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void srot_kernel_16 (long n, float *x, float *y, float c, float s) +{ + __asm__ + ( + "xscvdpspn 36, %x5 \n\t" // load c to all words + "xxspltw 36, 36, 0 \n\t" + + "xscvdpspn 37, %x6 \n\t" // load s to all words + "xxspltw 37, 37, 0 \n\t" + "lxvp 32, 0(%3) \n\t" // load x + "lxvp 34, 32(%3) \n\t" + "lxvp 48, 0(%4) \n\t" // load y + "lxvp 50, 32(%4) \n\t" + + "addic. 
%2, %2, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + + "lxvp 32, 64(%3) \n\t" // load x + "lxvp 34, 96(%3) \n\t" + "xvmulsp 52, 48, 36 \n\t" // c * y + "xvmulsp 53, 49, 36 \n\t" + "xvmulsp 54, 50, 36 \n\t" + "xvmulsp 55, 51, 36 \n\t" + + "xvmulsp 38, 48, 37 \n\t" // s * y + "xvmulsp 39, 49, 37 \n\t" + "xvmulsp 56, 50, 37 \n\t" + "xvmulsp 57, 51, 37 \n\t" + + "lxvp 48, 64(%4) \n\t" // load y + "lxvp 50, 96(%4) \n\t" + + "xvaddsp 40, 40, 38 \n\t" // c * x + s * y + "xvaddsp 41, 41, 39 \n\t" // c * x + s * y + "xvaddsp 42, 42, 56 \n\t" // c * x + s * y + "xvaddsp 43, 43, 57 \n\t" // c * x + s * y + + "stxvp 40, 0(%3) \n\t" // store x + "stxvp 42, 32(%3) \n\t" + + "xvsubsp 52, 52, 44 \n\t" // c * y - s * x + "xvsubsp 53, 53, 45 \n\t" // c * y - s * x + "xvsubsp 54, 54, 46 \n\t" // c * y - s * x + "xvsubsp 55, 55, 47 \n\t" // c * y - s * x + + "stxvp 52, 0(%4) \n\t" // store y + "stxvp 54, 32(%4) \n\t" + + "addi %3, %3, 64 \n\t" + "addi %4, %4, 64 \n\t" + + "addic. %2, %2, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + + "xvmulsp 52, 48, 36 \n\t" // c * y + "xvmulsp 53, 49, 36 \n\t" + "xvmulsp 54, 50, 36 \n\t" + "xvmulsp 55, 51, 36 \n\t" + + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + + "xvmulsp 38, 48, 37 \n\t" // s * y + "xvmulsp 39, 49, 37 \n\t" + "xvmulsp 56, 50, 37 \n\t" + "xvmulsp 57, 51, 37 \n\t" + + "xvaddsp 40, 40, 38 \n\t" // c * x + s * y + "xvaddsp 41, 41, 39 \n\t" // c * x + s * y + "xvaddsp 42, 42, 56 \n\t" // c * x + s * y + "xvaddsp 43, 43, 57 \n\t" // c * x + s * y + + "stxvp 40, 0(%3) \n\t" // store x + "stxvp 42, 32(%3) \n\t" + "xvsubsp 52, 52, 44 \n\t" // c * y - s * x + "xvsubsp 53, 53, 45 \n\t" // c * y - s * x + "xvsubsp 54, 54, 46 \n\t" // c * y - s * x + "xvsubsp 55, 55, 47 \n\t" // c * y - s * x + + "stxvp 52, 0(%4) \n\t" // store y + "stxvp 54, 32(%4) \n\t" + + "#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + "f" (c), // 5 + "f" (s) // 6 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57" + ); +} diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index de37e10a5..65572a8c1 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "sscal_microk_power8.c" +#elif defined(POWER10) +#include "sscal_microk_power10.c" #endif #endif @@ -102,12 +104,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; + for (j = 0; j < align; j++) { + x[j] = 0.0; + } + } + BLASLONG n1 = (n-j) & -32; + if ( n1 > 0 ) + { + sscal_kernel_16_zero(n1, &x[j]); + j+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { sscal_kernel_16_zero(n1, x); j=n1; } +#endif while(j < n) { @@ -120,12 +138,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; + for (j = 0; j < align; j++) { + x[j] = da * x[j]; + } + } + BLASLONG n1 = (n-j) & -32; + if ( n1 > 0 ) + { + sscal_kernel_16(n1, &x[j], da); + j+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { sscal_kernel_16(n1, x, da); j=n1; } +#endif while(j < n) { diff --git a/kernel/power/sscal_microk_power10.c b/kernel/power/sscal_microk_power10.c new file mode 100644 index 000000000..a523a1675 --- /dev/null +++ b/kernel/power/sscal_microk_power10.c @@ -0,0 +1,135 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void sscal_kernel_16 (long n, float *x, float alpha) +{ + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xscvdpspn 48, %x3 \n\t" + "xxspltw 48, 48, 0 \n\t" + + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + + "addic. 
%1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmulsp 40, 32, 48 \n\t" + "xvmulsp 41, 33, 48 \n\t" + "xvmulsp 42, 34, 48 \n\t" + "xvmulsp 43, 35, 48 \n\t" + "lxvp 32, 128(%2) \n\t" + "lxvp 34, 160(%2) \n\t" + "xvmulsp 44, 36, 48 \n\t" + "xvmulsp 45, 37, 48 \n\t" + "xvmulsp 46, 38, 48 \n\t" + "xvmulsp 47, 39, 48 \n\t" + "lxvp 36, 192(%2) \n\t" + "lxvp 38, 224(%2) \n\t" + + "stxvp 40, 0(%2) \n\t" + "stxvp 42, 32(%2) \n\t" + "stxvp 44, 64(%2) \n\t" + "stxvp 46, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmulsp 40, 32, 48 \n\t" + "xvmulsp 41, 33, 48 \n\t" + "xvmulsp 42, 34, 48 \n\t" + "xvmulsp 43, 35, 48 \n\t" + + "xvmulsp 44, 36, 48 \n\t" + "xvmulsp 45, 37, 48 \n\t" + "xvmulsp 46, 38, 48 \n\t" + "xvmulsp 47, 39, 48 \n\t" + + "stxvp 40, 0(%2) \n\t" + "stxvp 42, 32(%2) \n\t" + "stxvp 44, 64(%2) \n\t" + "stxvp 46, 96(%2) \n\t" + + "#n=%1 alpha=%3 x=%0=%2" + : + "+m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + "f" (alpha) // 3 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47","vs48" + ); +} + + +static void sscal_kernel_16_zero (long n, float *x) +{ + + __asm__ + ( + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "stxvp 32, 0(%2) \n\t" + "stxvp 32, 32(%2) \n\t" + "stxvp 32, 64(%2) \n\t" + "stxvp 32, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "bgt one%= \n" + + "#n=%1 x=%0=%2 " + : + "=m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + : + "cr0","vs32","vs33" + ); +} diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index 44522f0a0..dd249fd36 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "sswap_microk_power8.c" +#elif defined(POWER10) +#include "swap_microk_power10.c" #endif #endif @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { +#if defined(POWER10) + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } + BLASLONG n1 = (n-i) & -64; + if ( n1 > 0 ) + { + sswap_kernel_32(n1,&x[i], &y[i]); + i+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { sswap_kernel_32(n1, x, y); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/swap_microk_power10.c b/kernel/power/swap_microk_power10.c new file mode 100644 index 000000000..f9c1fee52 --- /dev/null +++ b/kernel/power/swap_microk_power10.c @@ -0,0 +1,105 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define HAVE_KERNEL_32 1 + +#if defined(DOUBLE) +static void dswap_kernel_32 (long n, double *x, double *y) +#else +static void sswap_kernel_32 (long n, float *x, float *y) +#endif +{ + __asm__ + ( + ".align 5 \n" + "one%=: \n\t" + + "lxvp 32, 0(%4) \n\t" + "lxvp 34, 32(%4) \n\t" + "lxvp 36, 64(%4) \n\t" + "lxvp 38, 96(%4) \n\t" + + "lxvp 40, 128(%4) \n\t" + "lxvp 42, 160(%4) \n\t" + "lxvp 44, 192(%4) \n\t" + "lxvp 46, 224(%4) \n\t" + + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "lxvp 56, 128(%3) \n\t" + "lxvp 58, 160(%3) \n\t" + "lxvp 60, 192(%3) \n\t" + "lxvp 62, 224(%3) \n\t" + + "stxvp 32, 0(%3) \n\t" + "stxvp 34, 32(%3) \n\t" + "stxvp 36, 64(%3) \n\t" + "stxvp 38, 96(%3) \n\t" + + "stxvp 40, 128(%3) \n\t" + "stxvp 42, 160(%3) \n\t" + "stxvp 44, 192(%3) \n\t" + "stxvp 46, 224(%3) \n\t" + + "stxvp 48, 0(%4) \n\t" + "stxvp 50, 32(%4) \n\t" + "stxvp 52, 64(%4) \n\t" + "stxvp 54, 96(%4) \n\t" + + "stxvp 56, 128(%4) \n\t" + "stxvp 58, 160(%4) \n\t" + "stxvp 60, 192(%4) \n\t" + "stxvp 62, 224(%4) \n\t" + + "addi %4, %4, 256 \n\t" + "addi %3, %3, 256 \n\t" + +#if defined(DOUBLE) + "addic. %2, %2, -32 \n\t" +#else + "addic. %2, %2, -64 \n\t" +#endif + "bgt one%= \n" + + "#n=%2 x=%0=%3 y=%1=%4" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/trsm_kernel_LN_power10.c b/kernel/power/trsm_kernel_LN_power10.c new file mode 100644 index 000000000..246c3a236 --- /dev/null +++ b/kernel/power/trsm_kernel_LN_power10.c @@ -0,0 +1,1279 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + b[56] = (c0[7] *= a[63]); + b[57] = (c1[7] *= a[63]); + b[58] = (c2[7] *= a[63]); + b[59] = (c3[7] *= a[63]); + b[60] = (c4[7] *= a[63]); + b[61] = (c5[7] *= a[63]); + b[62] = (c6[7] *= a[63]); + b[63] = (c7[7] *= a[63]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[29], 0); + VbS3 = 
vec_splat(Vb[29], 1); + VbS4 = vec_splat(Vb[30], 0); + VbS5 = vec_splat(Vb[30], 1); + VbS6 = vec_splat(Vb[31], 0); + VbS7 = vec_splat(Vb[31], 1); + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[28], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[29], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[30], Vc7[2]); + c0[6] -= c0[7] * a[62]; + c1[6] -= c1[7] * a[62]; + c2[6] -= c2[7] * a[62]; + c3[6] -= c3[7] * a[62]; + c4[6] -= c4[7] * a[62]; + c5[6] -= c5[7] * a[62]; + c6[6] -= c6[7] * a[62]; + c7[6] -= c7[7] * a[62]; + + b[48] = (c0[6] *= a[54]); + b[49] = (c1[6] *= a[54]); + b[50] = (c2[6] *= a[54]); + b[51] = (c3[6] *= a[54]); + b[52] = (c4[6] *= a[54]); + b[53] = (c5[6] *= a[54]); + b[54] = (c6[6] *= a[54]); + b[55] = (c7[6] *= a[54]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[25], 0); + VbS3 = vec_splat(Vb[25], 1); + VbS4 = vec_splat(Vb[26], 0); + VbS5 = vec_splat(Vb[26], 1); + VbS6 = vec_splat(Vb[27], 0); + VbS7 = vec_splat(Vb[27], 1); + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[24], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[25], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[26], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[24], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[25], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[26], Vc7[2]); + + b[40] = (c0[5] *= a[45]); + b[41] = (c1[5] *= a[45]); + b[42] = (c2[5] *= a[45]); + b[43] = (c3[5] *= a[45]); + b[44] = (c4[5] *= a[45]); + b[45] = (c5[5] *= a[45]); + b[46] = (c6[5] *= a[45]); + b[47] = (c7[5] *= a[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + VbS5 = vec_splat(Vb[22], 1); + VbS6 = vec_splat(Vb[23], 0); + VbS7 = vec_splat(Vb[23], 1); + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc1[1] 
= vec_nmsub(VbS1, Va[21], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[21], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[20], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[21], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[20], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[21], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[20], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[21], Vc7[1]); + c0[4] -= c0[5] * a[44]; + c1[4] -= c1[5] * a[44]; + c2[4] -= c2[5] * a[44]; + c3[4] -= c3[5] * a[44]; + c4[4] -= c4[5] * a[44]; + c5[4] -= c5[5] * a[44]; + c6[4] -= c6[5] * a[44]; + c7[4] -= c7[5] * a[44]; + + b[32] = (c0[4] *= a[36]); + b[33] = (c1[4] *= a[36]); + b[34] = (c2[4] *= a[36]); + b[35] = (c3[4] *= a[36]); + b[36] = (c4[4] *= a[36]); + b[37] = (c5[4] *= a[36]); + b[38] = (c6[4] *= a[36]); + b[39] = (c7[4] *= a[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + VbS4 = vec_splat(Vb[18], 0); + VbS5 = vec_splat(Vb[18], 1); + VbS6 = vec_splat(Vb[19], 0); + VbS7 = vec_splat(Vb[19], 1); + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[16], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[17], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[16], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[17], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[16], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[17], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[16], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[17], Vc7[1]); + + b[24] = (c0[3] *= a[27]); + b[25] = (c1[3] *= a[27]); + b[26] = (c2[3] *= a[27]); + b[27] = (c3[3] *= a[27]); + b[28] = (c4[3] *= a[27]); + b[29] = (c5[3] *= a[27]); + b[30] = (c6[3] *= a[27]); + b[31] = (c7[3] *= a[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + VbS3 = vec_splat(Vb[13], 1); + VbS4 = vec_splat(Vb[14], 0); + VbS5 = vec_splat(Vb[14], 1); + VbS6 = vec_splat(Vb[15], 0); + VbS7 = vec_splat(Vb[15], 1); + Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[12], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[12], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[12], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[12], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[12], Vc7[0]); + c0[2] -= c0[3] * a[26]; + c1[2] -= c1[3] * a[26]; + c2[2] -= c2[3] * a[26]; + c3[2] -= c3[3] * a[26]; + c4[2] -= c4[3] * a[26]; + c5[2] -= c5[3] * a[26]; + c6[2] -= c6[3] * a[26]; + c7[2] -= c7[3] * a[26]; + + b[16] = (c0[2] *= a[18]); + b[17] = (c1[2] *= a[18]); + b[18] = (c2[2] *= a[18]); + b[19] = (c3[2] *= a[18]); + b[20] = (c4[2] *= a[18]); + b[21] = (c5[2] *= a[18]); + b[22] = (c6[2] *= a[18]); + b[23] = (c7[2] *= a[18]); + VbS0 = vec_splat(Vb[ 8], 0); + VbS1 = vec_splat(Vb[ 8], 1); + VbS2 = vec_splat(Vb[ 9], 0); + VbS3 = vec_splat(Vb[ 9], 1); + VbS4 = vec_splat(Vb[10], 0); + VbS5 = vec_splat(Vb[10], 1); + VbS6 = vec_splat(Vb[11], 0); + VbS7 = vec_splat(Vb[11], 1); + Vc0[0] = vec_nmsub(VbS0, Va[8], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[8], Vc1[0]); + Vc2[0] = 
vec_nmsub(VbS2, Va[8], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[8], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[8], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[8], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[8], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[8], Vc7[0]); + + b[ 8] = (c0[1] *= a[9]); + b[ 9] = (c1[1] *= a[9]); + b[10] = (c2[1] *= a[9]); + b[11] = (c3[1] *= a[9]); + b[12] = (c4[1] *= a[9]); + b[13] = (c5[1] *= a[9]); + b[14] = (c6[1] *= a[9]); + b[15] = (c7[1] *= a[9]); + c0[0] -= c0[1] * a[8]; + c1[0] -= c1[1] * a[8]; + c2[0] -= c2[1] * a[8]; + c3[0] -= c3[1] * a[8]; + c4[0] -= c4[1] * a[8]; + c5[0] -= c5[1] * a[8]; + c6[0] -= c6[1] * a[8]; + c7[0] -= c7[1] * a[8]; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + b[120] = (c0[15] *= a[255]); + b[121] = (c1[15] *= a[255]); + b[122] = (c2[15] *= a[255]); + b[123] = (c3[15] *= a[255]); + b[124] = (c4[15] *= a[255]); + b[125] = (c5[15] *= a[255]); + b[126] = (c6[15] *= a[255]); + b[127] = (c7[15] *= a[255]); + VbS0 = vec_splat(Vb[30], 0); + VbS1 = vec_splat(Vb[30], 1); + VbS2 = vec_splat(Vb[30], 2); + VbS3 = vec_splat(Vb[30], 3); + VbS4 = vec_splat(Vb[31], 0); + VbS5 = vec_splat(Vb[31], 1); + VbS6 = vec_splat(Vb[31], 2); + VbS7 = vec_splat(Vb[31], 3); + Vc0[0] = vec_nmsub(VbS0, Va[60], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[61], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[62], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[60], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[61], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[62], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[60], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[61], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[62], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[60], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[61], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[62], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[60], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[61], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[62], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[60], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[61], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[62], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[60], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[61], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[62], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[60], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[61], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[62], Vc7[2]); + c0[12] -= b[120] * a[252]; + c0[13] -= b[120] * a[253]; + c0[14] -= b[120] * a[254]; + c1[12] -= b[121] * a[252]; + c1[13] -= b[121] * a[253]; + c1[14] -= b[121] * a[254]; + c2[12] -= b[122] * a[252]; + c2[13] -= b[122] * a[253]; + c2[14] -= b[122] * a[254]; + c3[12] -= b[123] * a[252]; + c3[13] -= b[123] * a[253]; + 
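+  /* Rows 0-11 of every C column are eliminated above with 4-wide
+   * vec_nmsub updates; rows 12-14 do not fill a complete vector
+   * register, so they are handled by the surrounding scalar updates. */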
c3[14] -= b[123] * a[254]; + c4[12] -= b[124] * a[252]; + c4[13] -= b[124] * a[253]; + c4[14] -= b[124] * a[254]; + c5[12] -= b[125] * a[252]; + c5[13] -= b[125] * a[253]; + c5[14] -= b[125] * a[254]; + c6[12] -= b[126] * a[252]; + c6[13] -= b[126] * a[253]; + c6[14] -= b[126] * a[254]; + c7[12] -= b[127] * a[252]; + c7[13] -= b[127] * a[253]; + c7[14] -= b[127] * a[254]; + + b[112] = (c0[14] *= a[238]); + b[113] = (c1[14] *= a[238]); + b[114] = (c2[14] *= a[238]); + b[115] = (c3[14] *= a[238]); + b[116] = (c4[14] *= a[238]); + b[117] = (c5[14] *= a[238]); + b[118] = (c6[14] *= a[238]); + b[119] = (c7[14] *= a[238]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[28], 2); + VbS3 = vec_splat(Vb[28], 3); + VbS4 = vec_splat(Vb[29], 0); + VbS5 = vec_splat(Vb[29], 1); + VbS6 = vec_splat(Vb[29], 2); + VbS7 = vec_splat(Vb[29], 3); + Vc0[0] = vec_nmsub(VbS0, Va[56], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[57], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[58], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[56], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[57], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[58], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[56], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[57], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[58], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[56], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[57], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[58], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[56], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[57], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[58], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[56], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[57], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[58], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[56], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[57], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[58], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[56], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[57], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[58], Vc7[2]); + c0[12] -= b[112] * a[236]; + c0[13] -= b[112] * a[237]; + c1[12] -= b[113] * a[236]; + c1[13] -= b[113] * a[237]; + c2[12] -= b[114] * a[236]; + c2[13] -= b[114] * a[237]; + c3[12] -= b[115] * a[236]; + c3[13] -= b[115] * a[237]; + c4[12] -= b[116] * a[236]; + c4[13] -= b[116] * a[237]; + c5[12] -= b[117] * a[236]; + c5[13] -= b[117] * a[237]; + c6[12] -= b[118] * a[236]; + c6[13] -= b[118] * a[237]; + c7[12] -= b[119] * a[236]; + c7[13] -= b[119] * a[237]; + + b[104] = (c0[13] *= a[221]); + b[105] = (c1[13] *= a[221]); + b[106] = (c2[13] *= a[221]); + b[107] = (c3[13] *= a[221]); + b[108] = (c4[13] *= a[221]); + b[109] = (c5[13] *= a[221]); + b[110] = (c6[13] *= a[221]); + b[111] = (c7[13] *= a[221]); + VbS0 = vec_splat(Vb[26], 0); + VbS1 = vec_splat(Vb[26], 1); + VbS2 = vec_splat(Vb[26], 2); + VbS3 = vec_splat(Vb[26], 3); + VbS4 = vec_splat(Vb[27], 0); + VbS5 = vec_splat(Vb[27], 1); + VbS6 = vec_splat(Vb[27], 2); + VbS7 = vec_splat(Vb[27], 3); + Vc0[0] = vec_nmsub(VbS0, Va[52], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[53], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[54], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[52], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[53], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[54], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[52], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[53], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[54], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[52], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[53], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[54], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[52], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[53], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[54], Vc4[2]); + 
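+  /* Elimination pattern used throughout this routine: vec_splat
+   * broadcasts one freshly solved value from the b panel into every
+   * lane of a VbS* register, and vec_nmsub(VbS, Va, Vc) computes
+   * Vc - VbS * Va, i.e. it subtracts the scaled column of A from
+   * four rows of C at a time. */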
Vc5[0] = vec_nmsub(VbS5, Va[52], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[53], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[54], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[52], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[53], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[54], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[52], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[53], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[54], Vc7[2]); + c0[12] -= b[104] * a[220]; + c1[12] -= b[105] * a[220]; + c2[12] -= b[106] * a[220]; + c3[12] -= b[107] * a[220]; + c4[12] -= b[108] * a[220]; + c5[12] -= b[109] * a[220]; + c6[12] -= b[110] * a[220]; + c7[12] -= b[111] * a[220]; + + b[ 96] = (c0[12] *= a[204]); + b[ 97] = (c1[12] *= a[204]); + b[ 98] = (c2[12] *= a[204]); + b[ 99] = (c3[12] *= a[204]); + b[100] = (c4[12] *= a[204]); + b[101] = (c5[12] *= a[204]); + b[102] = (c6[12] *= a[204]); + b[103] = (c7[12] *= a[204]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[24], 2); + VbS3 = vec_splat(Vb[24], 3); + VbS4 = vec_splat(Vb[25], 0); + VbS5 = vec_splat(Vb[25], 1); + VbS6 = vec_splat(Vb[25], 2); + VbS7 = vec_splat(Vb[25], 3); + Vc0[0] = vec_nmsub(VbS0, Va[48], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[49], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[50], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[48], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[49], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[50], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[48], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[49], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[50], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[48], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[49], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[50], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[48], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[49], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[50], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[48], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[49], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[50], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[48], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[49], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[50], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[48], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[49], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[50], Vc7[2]); + + b[88] = (c0[11] *= a[187]); + b[89] = (c1[11] *= a[187]); + b[90] = (c2[11] *= a[187]); + b[91] = (c3[11] *= a[187]); + b[92] = (c4[11] *= a[187]); + b[93] = (c5[11] *= a[187]); + b[94] = (c6[11] *= a[187]); + b[95] = (c7[11] *= a[187]); + VbS0 = vec_splat(Vb[22], 0); + VbS1 = vec_splat(Vb[22], 1); + VbS2 = vec_splat(Vb[22], 2); + VbS3 = vec_splat(Vb[22], 3); + VbS4 = vec_splat(Vb[23], 0); + VbS5 = vec_splat(Vb[23], 1); + VbS6 = vec_splat(Vb[23], 2); + VbS7 = vec_splat(Vb[23], 3); + Vc0[0] = vec_nmsub(VbS0, Va[44], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[45], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[44], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[45], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[44], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[45], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[44], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[45], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[44], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[45], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[44], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[45], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[44], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[45], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[44], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[45], Vc7[1]); + c0[ 8] -= b[88] * a[184]; + c0[ 9] -= b[88] * a[185]; + c0[10] -= b[88] * a[186]; + c1[ 8] -= b[89] * a[184]; + c1[ 9] -= b[89] * a[185]; + c1[10] -= b[89] * a[186]; + c2[ 8] -= 
b[90] * a[184]; + c2[ 9] -= b[90] * a[185]; + c2[10] -= b[90] * a[186]; + c3[ 8] -= b[91] * a[184]; + c3[ 9] -= b[91] * a[185]; + c3[10] -= b[91] * a[186]; + c4[ 8] -= b[92] * a[184]; + c4[ 9] -= b[92] * a[185]; + c4[10] -= b[92] * a[186]; + c5[ 8] -= b[93] * a[184]; + c5[ 9] -= b[93] * a[185]; + c5[10] -= b[93] * a[186]; + c6[ 8] -= b[94] * a[184]; + c6[ 9] -= b[94] * a[185]; + c6[10] -= b[94] * a[186]; + c7[ 8] -= b[95] * a[184]; + c7[ 9] -= b[95] * a[185]; + c7[10] -= b[95] * a[186]; + + b[80] = (c0[10] *= a[170]); + b[81] = (c1[10] *= a[170]); + b[82] = (c2[10] *= a[170]); + b[83] = (c3[10] *= a[170]); + b[84] = (c4[10] *= a[170]); + b[85] = (c5[10] *= a[170]); + b[86] = (c6[10] *= a[170]); + b[87] = (c7[10] *= a[170]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[20], 2); + VbS3 = vec_splat(Vb[20], 3); + VbS4 = vec_splat(Vb[21], 0); + VbS5 = vec_splat(Vb[21], 1); + VbS6 = vec_splat(Vb[21], 2); + VbS7 = vec_splat(Vb[21], 3); + Vc0[0] = vec_nmsub(VbS0, Va[40], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[41], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[40], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[41], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[40], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[41], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[40], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[41], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[40], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[41], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[40], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[41], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[40], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[41], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[40], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[41], Vc7[1]); + c0[8] -= b[80] * a[168]; + c0[9] -= b[80] * a[169]; + c1[8] -= b[81] * a[168]; + c1[9] -= b[81] * a[169]; + c2[8] -= b[82] * a[168]; + c2[9] -= b[82] * a[169]; + c3[8] -= b[83] * a[168]; + c3[9] -= b[83] * a[169]; + c4[8] -= b[84] * a[168]; + c4[9] -= b[84] * a[169]; + c5[8] -= b[85] * a[168]; + c5[9] -= b[85] * a[169]; + c6[8] -= b[86] * a[168]; + c6[9] -= b[86] * a[169]; + c7[8] -= b[87] * a[168]; + c7[9] -= b[87] * a[169]; + + b[72] = (c0[9] *= a[153]); + b[73] = (c1[9] *= a[153]); + b[74] = (c2[9] *= a[153]); + b[75] = (c3[9] *= a[153]); + b[76] = (c4[9] *= a[153]); + b[77] = (c5[9] *= a[153]); + b[78] = (c6[9] *= a[153]); + b[79] = (c7[9] *= a[153]); + VbS0 = vec_splat(Vb[18], 0); + VbS1 = vec_splat(Vb[18], 1); + VbS2 = vec_splat(Vb[18], 2); + VbS3 = vec_splat(Vb[18], 3); + VbS4 = vec_splat(Vb[19], 0); + VbS5 = vec_splat(Vb[19], 1); + VbS6 = vec_splat(Vb[19], 2); + VbS7 = vec_splat(Vb[19], 3); + Vc0[0] = vec_nmsub(VbS0, Va[36], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[37], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[36], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[37], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[36], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[37], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[36], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[37], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[36], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[37], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[36], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[37], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[36], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[37], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[36], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[37], Vc7[1]); + c0[8] -= b[72] * a[152]; + c1[8] -= b[73] * a[152]; + c2[8] -= b[74] * a[152]; + c3[8] -= b[75] * a[152]; + c4[8] -= b[76] * a[152]; + c5[8] -= b[77] * a[152]; + c6[8] -= b[78] * a[152]; + c7[8] -= b[79] * a[152]; + + b[64] = (c0[8] *= a[136]); + 
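+  /* Each step scales row r of the eight C columns by the diagonal
+   * entry of the packed A panel (stored pre-inverted, which is why
+   * this is a multiply rather than a divide, just as in the generic
+   * solve() later in this file) and stores the result both back into
+   * C and into the packed B panel so later GEMM calls reuse the
+   * solved values. */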
b[65] = (c1[8] *= a[136]); + b[66] = (c2[8] *= a[136]); + b[67] = (c3[8] *= a[136]); + b[68] = (c4[8] *= a[136]); + b[69] = (c5[8] *= a[136]); + b[70] = (c6[8] *= a[136]); + b[71] = (c7[8] *= a[136]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[16], 2); + VbS3 = vec_splat(Vb[16], 3); + VbS4 = vec_splat(Vb[17], 0); + VbS5 = vec_splat(Vb[17], 1); + VbS6 = vec_splat(Vb[17], 2); + VbS7 = vec_splat(Vb[17], 3); + Vc0[0] = vec_nmsub(VbS0, Va[32], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[33], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[32], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[33], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[32], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[33], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[32], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[33], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[32], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[33], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[32], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[33], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[32], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[33], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[32], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[33], Vc7[1]); + + b[56] = (c0[7] *= a[119]); + b[57] = (c1[7] *= a[119]); + b[58] = (c2[7] *= a[119]); + b[59] = (c3[7] *= a[119]); + b[60] = (c4[7] *= a[119]); + b[61] = (c5[7] *= a[119]); + b[62] = (c6[7] *= a[119]); + b[63] = (c7[7] *= a[119]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[28], Vc7[0]); + c0[4] -= b[56] * a[116]; + c0[5] -= b[56] * a[117]; + c0[6] -= b[56] * a[118]; + c1[4] -= b[57] * a[116]; + c1[5] -= b[57] * a[117]; + c1[6] -= b[57] * a[118]; + c2[4] -= b[58] * a[116]; + c2[5] -= b[58] * a[117]; + c2[6] -= b[58] * a[118]; + c3[4] -= b[59] * a[116]; + c3[5] -= b[59] * a[117]; + c3[6] -= b[59] * a[118]; + c4[4] -= b[60] * a[116]; + c4[5] -= b[60] * a[117]; + c4[6] -= b[60] * a[118]; + c5[4] -= b[61] * a[116]; + c5[5] -= b[61] * a[117]; + c5[6] -= b[61] * a[118]; + c6[4] -= b[62] * a[116]; + c6[5] -= b[62] * a[117]; + c6[6] -= b[62] * a[118]; + c7[4] -= b[63] * a[116]; + c7[5] -= b[63] * a[117]; + c7[6] -= b[63] * a[118]; + + b[48] = (c0[6] *= a[102]); + b[49] = (c1[6] *= a[102]); + b[50] = (c2[6] *= a[102]); + b[51] = (c3[6] *= a[102]); + b[52] = (c4[6] *= a[102]); + b[53] = (c5[6] *= a[102]); + b[54] = (c6[6] *= a[102]); + b[55] = (c7[6] *= a[102]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + VbS7 = vec_splat(Vb[13], 3); + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[24], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[24], Vc7[0]); + c0[4] -= b[48] * a[100]; + c0[5] -= b[48] * a[101]; + c1[4] -= b[49] * a[100]; + 
c1[5] -= b[49] * a[101]; + c2[4] -= b[50] * a[100]; + c2[5] -= b[50] * a[101]; + c3[4] -= b[51] * a[100]; + c3[5] -= b[51] * a[101]; + c4[4] -= b[52] * a[100]; + c4[5] -= b[52] * a[101]; + c5[4] -= b[53] * a[100]; + c5[5] -= b[53] * a[101]; + c6[4] -= b[54] * a[100]; + c6[5] -= b[54] * a[101]; + c7[4] -= b[55] * a[100]; + c7[5] -= b[55] * a[101]; + + b[40] = (c0[5] *= a[85]); + b[41] = (c1[5] *= a[85]); + b[42] = (c2[5] *= a[85]); + b[43] = (c3[5] *= a[85]); + b[44] = (c4[5] *= a[85]); + b[45] = (c5[5] *= a[85]); + b[46] = (c6[5] *= a[85]); + b[47] = (c7[5] *= a[85]); + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + VbS6 = vec_splat(Vb[11], 2); + VbS7 = vec_splat(Vb[11], 3); + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[20], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[20], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[20], Vc7[0]); + c0[4] -= b[40] * a[84]; + c1[4] -= b[41] * a[84]; + c2[4] -= b[42] * a[84]; + c3[4] -= b[43] * a[84]; + c4[4] -= b[44] * a[84]; + c5[4] -= b[45] * a[84]; + c6[4] -= b[46] * a[84]; + c7[4] -= b[47] * a[84]; + + b[32] = (c0[4] *= a[68]); + b[33] = (c1[4] *= a[68]); + b[34] = (c2[4] *= a[68]); + b[35] = (c3[4] *= a[68]); + b[36] = (c4[4] *= a[68]); + b[37] = (c5[4] *= a[68]); + b[38] = (c6[4] *= a[68]); + b[39] = (c7[4] *= a[68]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + VbS5 = vec_splat(Vb[9], 1); + VbS6 = vec_splat(Vb[9], 2); + VbS7 = vec_splat(Vb[9], 3); + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[16], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[16], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[16], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[16], Vc7[0]); + + b[24] = (c0[3] *= a[51]); + b[25] = (c1[3] *= a[51]); + b[26] = (c2[3] *= a[51]); + b[27] = (c3[3] *= a[51]); + b[28] = (c4[3] *= a[51]); + b[29] = (c5[3] *= a[51]); + b[30] = (c6[3] *= a[51]); + b[31] = (c7[3] *= a[51]); + c0[0] -= b[24] * a[48]; + c0[1] -= b[24] * a[49]; + c0[2] -= b[24] * a[50]; + c1[0] -= b[25] * a[48]; + c1[1] -= b[25] * a[49]; + c1[2] -= b[25] * a[50]; + c2[0] -= b[26] * a[48]; + c2[1] -= b[26] * a[49]; + c2[2] -= b[26] * a[50]; + c3[0] -= b[27] * a[48]; + c3[1] -= b[27] * a[49]; + c3[2] -= b[27] * a[50]; + c4[0] -= b[28] * a[48]; + c4[1] -= b[28] * a[49]; + c4[2] -= b[28] * a[50]; + c5[0] -= b[29] * a[48]; + c5[1] -= b[29] * a[49]; + c5[2] -= b[29] * a[50]; + c6[0] -= b[30] * a[48]; + c6[1] -= b[30] * a[49]; + c6[2] -= b[30] * a[50]; + c7[0] -= b[31] * a[48]; + c7[1] -= b[31] * a[49]; + c7[2] -= b[31] * a[50]; + + b[16] = (c0[2] *= a[34]); + b[17] = (c1[2] *= a[34]); + b[18] = (c2[2] *= a[34]); + b[19] = (c3[2] *= a[34]); + b[20] = (c4[2] *= a[34]); + b[21] = (c5[2] *= a[34]); + b[22] = (c6[2] *= a[34]); + b[23] = (c7[2] *= a[34]); + c0[0] -= b[16] * a[32]; + c0[1] -= b[16] * a[33]; + c1[0] -= b[17] * a[32]; + c1[1] -= b[17] * a[33]; + c2[0] -= b[18] * a[32]; + c2[1] -= b[18] * a[33]; + c3[0] -= b[19] * a[32]; + c3[1] -= b[19] * a[33]; + c4[0] -= b[20] * a[32]; + c4[1] -= b[20] * a[33]; + c5[0] -= b[21] 
* a[32]; + c5[1] -= b[21] * a[33]; + c6[0] -= b[22] * a[32]; + c6[1] -= b[22] * a[33]; + c7[0] -= b[23] * a[32]; + c7[1] -= b[23] * a[33]; + + b[ 8] = (c0[1] *= a[17]); + b[ 9] = (c1[1] *= a[17]); + b[10] = (c2[1] *= a[17]); + b[11] = (c3[1] *= a[17]); + b[12] = (c4[1] *= a[17]); + b[13] = (c5[1] *= a[17]); + b[14] = (c6[1] *= a[17]); + b[15] = (c7[1] *= a[17]); + c0[0] -= b[ 8] * a[16]; + c1[0] -= b[ 9] * a[16]; + c2[0] -= b[10] * a[16]; + c3[0] -= b[11] * a[16]; + c4[0] -= b[12] * a[16]; + c5[0] -= b[13] * a[16]; + c6[0] -= b[14] * a[16]; + c7[0] -= b[15] * a[16]; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk 
- i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_LT_power10.c b/kernel/power/trsm_kernel_LT_power10.c new file mode 100644 index 000000000..51f3a4e61 --- /dev/null +++ b/kernel/power/trsm_kernel_LT_power10.c @@ -0,0 +1,1264 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[1], 0); + VbS3 = vec_splat(Vb[1], 1); + VbS4 = vec_splat(Vb[2], 0); + VbS5 = vec_splat(Vb[2], 1); + VbS6 = vec_splat(Vb[3], 0); + VbS7 = vec_splat(Vb[3], 1); + Vc0[1] = vec_nmsub(VbS0, Va[1], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[2], Vc0[2]); + Vc0[3] = 
vec_nmsub(VbS0, Va[3], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + c0[1] -= c0[0] * a[1]; + c1[1] -= c1[0] * a[1]; + c2[1] -= c2[0] * a[1]; + c3[1] -= c3[0] * a[1]; + c4[1] -= c4[0] * a[1]; + c5[1] -= c5[0] * a[1]; + c6[1] -= c6[0] * a[1]; + c7[1] -= c7[0] * a[1]; + + b[ 8] = (c0[1] *= a[9]); + b[ 9] = (c1[1] *= a[9]); + b[10] = (c2[1] *= a[9]); + b[11] = (c3[1] *= a[9]); + b[12] = (c4[1] *= a[9]); + b[13] = (c5[1] *= a[9]); + b[14] = (c6[1] *= a[9]); + b[15] = (c7[1] *= a[9]); + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[5], 0); + VbS3 = vec_splat(Vb[5], 1); + VbS4 = vec_splat(Vb[6], 0); + VbS5 = vec_splat(Vb[6], 1); + VbS6 = vec_splat(Vb[7], 0); + VbS7 = vec_splat(Vb[7], 1); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[5], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[6], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[7], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[7], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[7], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[7], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[7], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[7], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[7], Vc7[3]); + + b[16] = (c0[2] *= a[18]); + b[17] = (c1[2] *= a[18]); + b[18] = (c2[2] *= a[18]); + b[19] = (c3[2] *= a[18]); + b[20] = (c4[2] *= a[18]); + b[21] = (c5[2] *= a[18]); + b[22] = (c6[2] *= a[18]); + b[23] = (c7[2] *= a[18]); + VbS0 = vec_splat(Vb[ 8], 0); + VbS1 = vec_splat(Vb[ 8], 1); + VbS2 = vec_splat(Vb[ 9], 0); + VbS3 = vec_splat(Vb[ 9], 1); + VbS4 = vec_splat(Vb[10], 0); + VbS5 = vec_splat(Vb[10], 1); + VbS6 = vec_splat(Vb[11], 0); + VbS7 = vec_splat(Vb[11], 1); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[10], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[11], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[11], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[11], Vc4[3]); + Vc5[2] = 
vec_nmsub(VbS5, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[11], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[11], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[11], Vc7[3]); + c0[3] -= c0[2] * a[19]; + c1[3] -= c1[2] * a[19]; + c2[3] -= c2[2] * a[19]; + c3[3] -= c3[2] * a[19]; + c4[3] -= c4[2] * a[19]; + c5[3] -= c5[2] * a[19]; + c6[3] -= c6[2] * a[19]; + c7[3] -= c7[2] * a[19]; + + b[24] = (c0[3] *= a[27]); + b[25] = (c1[3] *= a[27]); + b[26] = (c2[3] *= a[27]); + b[27] = (c3[3] *= a[27]); + b[28] = (c4[3] *= a[27]); + b[29] = (c5[3] *= a[27]); + b[30] = (c6[3] *= a[27]); + b[31] = (c7[3] *= a[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + VbS3 = vec_splat(Vb[13], 1); + VbS4 = vec_splat(Vb[14], 0); + VbS5 = vec_splat(Vb[14], 1); + VbS6 = vec_splat(Vb[15], 0); + VbS7 = vec_splat(Vb[15], 1); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[14], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[15], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[15], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[15], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[15], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[15], Vc7[3]); + + b[32] = (c0[4] *= a[36]); + b[33] = (c1[4] *= a[36]); + b[34] = (c2[4] *= a[36]); + b[35] = (c3[4] *= a[36]); + b[36] = (c4[4] *= a[36]); + b[37] = (c5[4] *= a[36]); + b[38] = (c6[4] *= a[36]); + b[39] = (c7[4] *= a[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + VbS4 = vec_splat(Vb[18], 0); + VbS5 = vec_splat(Vb[18], 1); + VbS6 = vec_splat(Vb[19], 0); + VbS7 = vec_splat(Vb[19], 1); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[19], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[19], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[19], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[19], Vc7[3]); + c0[5] -= c0[4] * a[37]; + c1[5] -= c1[4] * a[37]; + c2[5] -= c2[4] * a[37]; + c3[5] -= c3[4] * a[37]; + c4[5] -= c4[4] * a[37]; + c5[5] -= c5[4] * a[37]; + c6[5] -= c6[4] * a[37]; + c7[5] -= c7[4] * a[37]; + + b[40] = (c0[5] *= a[45]); + b[41] = (c1[5] *= a[45]); + b[42] = (c2[5] *= a[45]); + b[43] = (c3[5] *= a[45]); + b[44] = (c4[5] *= a[45]); + b[45] = (c5[5] *= a[45]); + b[46] = (c6[5] *= a[45]); + b[47] = (c7[5] *= a[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + VbS5 = vec_splat(Vb[22], 1); + VbS6 = vec_splat(Vb[23], 0); + VbS7 = vec_splat(Vb[23], 1); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[23], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[23], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[23], Vc7[3]); + + b[48] = (c0[6] *= a[54]); + b[49] = (c1[6] *= 
a[54]); + b[50] = (c2[6] *= a[54]); + b[51] = (c3[6] *= a[54]); + b[52] = (c4[6] *= a[54]); + b[53] = (c5[6] *= a[54]); + b[54] = (c6[6] *= a[54]); + b[55] = (c7[6] *= a[54]); + c0[7] -= c0[6] * a[55]; + c1[7] -= c1[6] * a[55]; + c2[7] -= c2[6] * a[55]; + c3[7] -= c3[6] * a[55]; + c4[7] -= c4[6] * a[55]; + c5[7] -= c5[6] * a[55]; + c6[7] -= c6[6] * a[55]; + c7[7] -= c7[6] * a[55]; + + b[56] = (c0[7] *= a[63]); + b[57] = (c1[7] *= a[63]); + b[58] = (c2[7] *= a[63]); + b[59] = (c3[7] *= a[63]); + b[60] = (c4[7] *= a[63]); + b[61] = (c5[7] *= a[63]); + b[62] = (c6[7] *= a[63]); + b[63] = (c7[7] *= a[63]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[0], 2); + VbS3 = vec_splat(Vb[0], 3); + VbS4 = vec_splat(Vb[1], 0); + VbS5 = vec_splat(Vb[1], 1); + VbS6 = vec_splat(Vb[1], 2); + VbS7 = vec_splat(Vb[1], 3); + Vc0[1] = vec_nmsub(VbS0, Va[1], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[2], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[3], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + c0[1] -= b[0] * a[ 1]; + c0[2] -= b[0] * a[ 2]; + c0[3] -= b[0] * a[ 3]; + c1[1] -= b[1] * a[ 1]; + c1[2] -= b[1] * a[ 2]; + c1[3] -= b[1] * a[ 3]; + c2[1] -= b[2] * a[ 1]; + c2[2] -= b[2] * a[ 2]; + c2[3] -= b[2] * a[ 3]; + c3[1] -= b[3] * a[ 1]; + c3[2] -= b[3] * a[ 2]; + c3[3] -= b[3] * a[ 3]; + c4[1] -= b[4] * a[ 1]; + c4[2] -= b[4] * a[ 2]; + c4[3] -= b[4] * a[ 3]; + c5[1] -= b[5] * a[ 1]; + c5[2] -= b[5] * a[ 2]; + c5[3] -= b[5] * a[ 3]; + c6[1] -= b[6] * a[ 1]; + c6[2] -= b[6] * a[ 2]; + c6[3] -= b[6] * a[ 3]; + c7[1] -= b[7] * a[ 1]; + c7[2] -= b[7] * a[ 2]; + c7[3] -= b[7] * a[ 3]; + + b[ 8] = (c0[1] *= a[17]); + b[ 9] = (c1[1] *= a[17]); 
+ b[10] = (c2[1] *= a[17]); + b[11] = (c3[1] *= a[17]); + b[12] = (c4[1] *= a[17]); + b[13] = (c5[1] *= a[17]); + b[14] = (c6[1] *= a[17]); + b[15] = (c7[1] *= a[17]); + VbS0 = vec_splat(Vb[2], 0); + VbS1 = vec_splat(Vb[2], 1); + VbS2 = vec_splat(Vb[2], 2); + VbS3 = vec_splat(Vb[2], 3); + VbS4 = vec_splat(Vb[3], 0); + VbS5 = vec_splat(Vb[3], 1); + VbS6 = vec_splat(Vb[3], 2); + VbS7 = vec_splat(Vb[3], 3); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[5], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[6], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[7], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[7], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[7], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[7], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[7], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[7], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[7], Vc7[3]); + c0[2] -= b[ 8] * a[18]; + c0[3] -= b[ 8] * a[19]; + c1[2] -= b[ 9] * a[18]; + c1[3] -= b[ 9] * a[19]; + c2[2] -= b[10] * a[18]; + c2[3] -= b[10] * a[19]; + c3[2] -= b[11] * a[18]; + c3[3] -= b[11] * a[19]; + c4[2] -= b[12] * a[18]; + c4[3] -= b[12] * a[19]; + c5[2] -= b[13] * a[18]; + c5[3] -= b[13] * a[19]; + c6[2] -= b[14] * a[18]; + c6[3] -= b[14] * a[19]; + c7[2] -= b[15] * a[18]; + c7[3] -= b[15] * a[19]; + + b[16] = (c0[2] *= a[34]); + b[17] = (c1[2] *= a[34]); + b[18] = (c2[2] *= a[34]); + b[19] = (c3[2] *= a[34]); + b[20] = (c4[2] *= a[34]); + b[21] = (c5[2] *= a[34]); + b[22] = (c6[2] *= a[34]); + b[23] = (c7[2] *= a[34]); + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[4], 2); + VbS3 = vec_splat(Vb[4], 3); + VbS4 = vec_splat(Vb[5], 0); + VbS5 = vec_splat(Vb[5], 1); + VbS6 = vec_splat(Vb[5], 2); + VbS7 = vec_splat(Vb[5], 3); + Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[ 9], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[10], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[11], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[ 9], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[11], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[ 9], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[11], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[ 9], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[11], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[ 9], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[11], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[ 9], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[11], Vc7[3]); + c0[3] -= b[16] * a[35]; + c1[3] -= b[17] * a[35]; + c2[3] -= b[18] * a[35]; + c3[3] -= b[19] * a[35]; + c4[3] -= b[20] * a[35]; + c5[3] -= b[21] * a[35]; + c6[3] -= b[22] * 
a[35]; + c7[3] -= b[23] * a[35]; + + b[24] = (c0[3] *= a[51]); + b[25] = (c1[3] *= a[51]); + b[26] = (c2[3] *= a[51]); + b[27] = (c3[3] *= a[51]); + b[28] = (c4[3] *= a[51]); + b[29] = (c5[3] *= a[51]); + b[30] = (c6[3] *= a[51]); + b[31] = (c7[3] *= a[51]); + VbS0 = vec_splat(Vb[6], 0); + VbS1 = vec_splat(Vb[6], 1); + VbS2 = vec_splat(Vb[6], 2); + VbS3 = vec_splat(Vb[6], 3); + VbS4 = vec_splat(Vb[7], 0); + VbS5 = vec_splat(Vb[7], 1); + VbS6 = vec_splat(Vb[7], 2); + VbS7 = vec_splat(Vb[7], 3); + Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[13], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[14], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[15], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[13], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[15], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[13], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[15], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[13], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[15], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[13], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[15], Vc7[3]); + + b[32] = (c0[4] *= a[68]); + b[33] = (c1[4] *= a[68]); + b[34] = (c2[4] *= a[68]); + b[35] = (c3[4] *= a[68]); + b[36] = (c4[4] *= a[68]); + b[37] = (c5[4] *= a[68]); + b[38] = (c6[4] *= a[68]); + b[39] = (c7[4] *= a[68]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + VbS5 = vec_splat(Vb[9], 1); + VbS6 = vec_splat(Vb[9], 2); + VbS7 = vec_splat(Vb[9], 3); + Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[18], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[19], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[18], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[19], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[18], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[19], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[18], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[19], Vc7[3]); + c0[5] -= b[32] * a[69]; + c0[6] -= b[32] * a[70]; + c0[7] -= b[32] * a[71]; + c1[5] -= b[33] * a[69]; + c1[6] -= b[33] * a[70]; + c1[7] -= b[33] * a[71]; + c2[5] -= b[34] * a[69]; + c2[6] -= b[34] * a[70]; + c2[7] -= b[34] * a[71]; + c3[5] -= b[35] * a[69]; + c3[6] -= b[35] * a[70]; + c3[7] -= b[35] * a[71]; + c4[5] -= b[36] * a[69]; + c4[6] -= b[36] * a[70]; + c4[7] -= b[36] * a[71]; + c5[5] -= b[37] * a[69]; + c5[6] -= b[37] * a[70]; + c5[7] -= b[37] * a[71]; + c6[5] -= b[38] * a[69]; + c6[6] -= b[38] * a[70]; + c6[7] -= b[38] * a[71]; + c7[5] -= b[39] * a[69]; + c7[6] -= b[39] * a[70]; + c7[7] -= b[39] * a[71]; + + b[40] = (c0[5] *= a[85]); + b[41] = (c1[5] *= a[85]); + b[42] = (c2[5] *= a[85]); + b[43] = (c3[5] *= a[85]); + b[44] = (c4[5] *= a[85]); + b[45] = (c5[5] *= a[85]); + b[46] = (c6[5] *= a[85]); + b[47] 
= (c7[5] *= a[85]); + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + VbS6 = vec_splat(Vb[11], 2); + VbS7 = vec_splat(Vb[11], 3); + Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[22], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[23], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[22], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[23], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[22], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[23], Vc7[3]); + c0[6] -= b[40] * a[86]; + c0[7] -= b[40] * a[87]; + c1[6] -= b[41] * a[86]; + c1[7] -= b[41] * a[87]; + c2[6] -= b[42] * a[86]; + c2[7] -= b[42] * a[87]; + c3[6] -= b[43] * a[86]; + c3[7] -= b[43] * a[87]; + c4[6] -= b[44] * a[86]; + c4[7] -= b[44] * a[87]; + c5[6] -= b[45] * a[86]; + c5[7] -= b[45] * a[87]; + c6[6] -= b[46] * a[86]; + c6[7] -= b[46] * a[87]; + c7[6] -= b[47] * a[86]; + c7[7] -= b[47] * a[87]; + + b[48] = (c0[6] *= a[102]); + b[49] = (c1[6] *= a[102]); + b[50] = (c2[6] *= a[102]); + b[51] = (c3[6] *= a[102]); + b[52] = (c4[6] *= a[102]); + b[53] = (c5[6] *= a[102]); + b[54] = (c6[6] *= a[102]); + b[55] = (c7[6] *= a[102]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + VbS7 = vec_splat(Vb[13], 3); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[26], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[27], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[26], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[27], Vc7[3]); + c0[7] -= b[48] * a[103]; + c1[7] -= b[49] * a[103]; + c2[7] -= b[50] * a[103]; + c3[7] -= b[51] * a[103]; + c4[7] -= b[52] * a[103]; + c5[7] -= b[53] * a[103]; + c6[7] -= b[54] * a[103]; + c7[7] -= b[55] * a[103]; + + b[56] = (c0[7] *= a[119]); + b[57] = (c1[7] *= a[119]); + b[58] = (c2[7] *= a[119]); + b[59] = (c3[7] *= a[119]); + b[60] = (c4[7] *= a[119]); + b[61] = (c5[7] *= a[119]); + b[62] = (c6[7] *= a[119]); + b[63] = (c7[7] *= a[119]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]); + Vc3[2] = 
vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[30], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[31], Vc7[3]); + + b[64] = (c0[8] *= a[136]); + b[65] = (c1[8] *= a[136]); + b[66] = (c2[8] *= a[136]); + b[67] = (c3[8] *= a[136]); + b[68] = (c4[8] *= a[136]); + b[69] = (c5[8] *= a[136]); + b[70] = (c6[8] *= a[136]); + b[71] = (c7[8] *= a[136]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[16], 2); + VbS3 = vec_splat(Vb[16], 3); + VbS4 = vec_splat(Vb[17], 0); + VbS5 = vec_splat(Vb[17], 1); + VbS6 = vec_splat(Vb[17], 2); + VbS7 = vec_splat(Vb[17], 3); + Vc0[3] = vec_nmsub(VbS0, Va[35], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[35], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[35], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[35], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[35], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[35], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[35], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[35], Vc7[3]); + c0[ 9] -= b[64] * a[137]; + c0[10] -= b[64] * a[138]; + c0[11] -= b[64] * a[139]; + c1[ 9] -= b[65] * a[137]; + c1[10] -= b[65] * a[138]; + c1[11] -= b[65] * a[139]; + c2[ 9] -= b[66] * a[137]; + c2[10] -= b[66] * a[138]; + c2[11] -= b[66] * a[139]; + c3[ 9] -= b[67] * a[137]; + c3[10] -= b[67] * a[138]; + c3[11] -= b[67] * a[139]; + c4[ 9] -= b[68] * a[137]; + c4[10] -= b[68] * a[138]; + c4[11] -= b[68] * a[139]; + c5[ 9] -= b[69] * a[137]; + c5[10] -= b[69] * a[138]; + c5[11] -= b[69] * a[139]; + c6[ 9] -= b[70] * a[137]; + c6[10] -= b[70] * a[138]; + c6[11] -= b[70] * a[139]; + c7[ 9] -= b[71] * a[137]; + c7[10] -= b[71] * a[138]; + c7[11] -= b[71] * a[139]; + + b[72] = (c0[9] *= a[153]); + b[73] = (c1[9] *= a[153]); + b[74] = (c2[9] *= a[153]); + b[75] = (c3[9] *= a[153]); + b[76] = (c4[9] *= a[153]); + b[77] = (c5[9] *= a[153]); + b[78] = (c6[9] *= a[153]); + b[79] = (c7[9] *= a[153]); + VbS0 = vec_splat(Vb[18], 0); + VbS1 = vec_splat(Vb[18], 1); + VbS2 = vec_splat(Vb[18], 2); + VbS3 = vec_splat(Vb[18], 3); + VbS4 = vec_splat(Vb[19], 0); + VbS5 = vec_splat(Vb[19], 1); + VbS6 = vec_splat(Vb[19], 2); + VbS7 = vec_splat(Vb[19], 3); + Vc0[3] = vec_nmsub(VbS0, Va[39], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[39], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[39], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[39], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[39], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[39], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[39], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[39], Vc7[3]); + c0[10] -= b[72] * a[154]; + c0[11] -= b[72] * a[155]; + c1[10] -= b[73] * a[154]; + c1[11] -= b[73] * a[155]; + c2[10] -= b[74] * a[154]; + c2[11] -= b[74] * a[155]; + c3[10] -= b[75] * a[154]; + c3[11] -= b[75] * a[155]; + c4[10] -= b[76] * a[154]; + c4[11] -= b[76] * a[155]; + c5[10] -= b[77] * a[154]; + c5[11] -= b[77] * a[155]; + c6[10] -= b[78] * a[154]; + c6[11] -= b[78] * a[155]; + c7[10] -= b[79] * a[154]; + c7[11] -= b[79] * a[155]; + + b[80] = (c0[10] *= a[170]); + b[81] = (c1[10] *= a[170]); + b[82] = (c2[10] *= a[170]); + b[83] = (c3[10] *= a[170]); + b[84] = (c4[10] *= a[170]); + b[85] = (c5[10] *= a[170]); + b[86] = (c6[10] *= a[170]); + b[87] = (c7[10] *= a[170]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[20], 2); + 
VbS3 = vec_splat(Vb[20], 3); + VbS4 = vec_splat(Vb[21], 0); + VbS5 = vec_splat(Vb[21], 1); + VbS6 = vec_splat(Vb[21], 2); + VbS7 = vec_splat(Vb[21], 3); + Vc0[3] = vec_nmsub(VbS0, Va[43], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[43], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[43], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[43], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[43], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[43], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[43], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[43], Vc7[3]); + c0[11] -= b[80] * a[171]; + c1[11] -= b[81] * a[171]; + c2[11] -= b[82] * a[171]; + c3[11] -= b[83] * a[171]; + c4[11] -= b[84] * a[171]; + c5[11] -= b[85] * a[171]; + c6[11] -= b[86] * a[171]; + c7[11] -= b[87] * a[171]; + + b[88] = (c0[11] *= a[187]); + b[89] = (c1[11] *= a[187]); + b[90] = (c2[11] *= a[187]); + b[91] = (c3[11] *= a[187]); + b[92] = (c4[11] *= a[187]); + b[93] = (c5[11] *= a[187]); + b[94] = (c6[11] *= a[187]); + b[95] = (c7[11] *= a[187]); + VbS0 = vec_splat(Vb[22], 0); + VbS1 = vec_splat(Vb[22], 1); + VbS2 = vec_splat(Vb[22], 2); + VbS3 = vec_splat(Vb[22], 3); + VbS4 = vec_splat(Vb[23], 0); + VbS5 = vec_splat(Vb[23], 1); + VbS6 = vec_splat(Vb[23], 2); + VbS7 = vec_splat(Vb[23], 3); + Vc0[3] = vec_nmsub(VbS0, Va[47], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[47], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[47], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[47], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[47], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[47], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[47], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[47], Vc7[3]); + + b[ 96] = (c0[12] *= a[204]); + b[ 97] = (c1[12] *= a[204]); + b[ 98] = (c2[12] *= a[204]); + b[ 99] = (c3[12] *= a[204]); + b[100] = (c4[12] *= a[204]); + b[101] = (c5[12] *= a[204]); + b[102] = (c6[12] *= a[204]); + b[103] = (c7[12] *= a[204]); + c0[13] -= b[ 96] * a[205]; + c0[14] -= b[ 96] * a[206]; + c0[15] -= b[ 96] * a[207]; + c1[13] -= b[ 97] * a[205]; + c1[14] -= b[ 97] * a[206]; + c1[15] -= b[ 97] * a[207]; + c2[13] -= b[ 98] * a[205]; + c2[14] -= b[ 98] * a[206]; + c2[15] -= b[ 98] * a[207]; + c3[13] -= b[ 99] * a[205]; + c3[14] -= b[ 99] * a[206]; + c3[15] -= b[ 99] * a[207]; + c4[13] -= b[100] * a[205]; + c4[14] -= b[100] * a[206]; + c4[15] -= b[100] * a[207]; + c5[13] -= b[101] * a[205]; + c5[14] -= b[101] * a[206]; + c5[15] -= b[101] * a[207]; + c6[13] -= b[102] * a[205]; + c6[14] -= b[102] * a[206]; + c6[15] -= b[102] * a[207]; + c7[13] -= b[103] * a[205]; + c7[14] -= b[103] * a[206]; + c7[15] -= b[103] * a[207]; + + b[104] = (c0[13] *= a[221]); + b[105] = (c1[13] *= a[221]); + b[106] = (c2[13] *= a[221]); + b[107] = (c3[13] *= a[221]); + b[108] = (c4[13] *= a[221]); + b[109] = (c5[13] *= a[221]); + b[110] = (c6[13] *= a[221]); + b[111] = (c7[13] *= a[221]); + c0[14] -= b[104] * a[222]; + c0[15] -= b[104] * a[223]; + c1[14] -= b[105] * a[222]; + c1[15] -= b[105] * a[223]; + c2[14] -= b[106] * a[222]; + c2[15] -= b[106] * a[223]; + c3[14] -= b[107] * a[222]; + c3[15] -= b[107] * a[223]; + c4[14] -= b[108] * a[222]; + c4[15] -= b[108] * a[223]; + c5[14] -= b[109] * a[222]; + c5[15] -= b[109] * a[223]; + c6[14] -= b[110] * a[222]; + c6[15] -= b[110] * a[223]; + c7[14] -= b[111] * a[222]; + c7[15] -= b[111] * a[223]; + + b[112] = (c0[14] *= a[238]); + b[113] = (c1[14] *= a[238]); + b[114] = (c2[14] *= a[238]); + b[115] = (c3[14] *= a[238]); + b[116] = (c4[14] *= a[238]); + b[117] = (c5[14] *= a[238]); + b[118] = (c6[14] *= a[238]); + b[119] = (c7[14] *= a[238]); + c0[15] -= b[112] * a[239]; + c1[15] -= b[113] * a[239]; + c2[15] -= 
b[114] * a[239]; + c3[15] -= b[115] * a[239]; + c4[15] -= b[116] * a[239]; + c5[15] -= b[117] * a[239]; + c6[15] -= b[118] * a[239]; + c7[15] -= b[119] * a[239]; + + b[120] = (c0[15] *= a[255]); + b[121] = (c1[15] *= a[255]); + b[122] = (c2[15] *= a[255]); + b[123] = (c3[15] *= a[255]); + b[124] = (c4[15] *= a[255]); + b[125] = (c5[15] *= a[255]); + b[126] = (c6[15] *= a[255]); + b[127] = (c7[15] *= a[255]); +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, 
GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_RN_power10.c b/kernel/power/trsm_kernel_RN_power10.c new file mode 100644 index 000000000..92c26fcc3 --- /dev/null +++ b/kernel/power/trsm_kernel_RN_power10.c @@ -0,0 +1,828 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6; + + a[0] = (c0[0] *= b[0]); + a[1] = (c0[1] *= b[0]); + a[2] = (c0[2] *= b[0]); + a[3] = (c0[3] *= b[0]); + a[4] = (c0[4] *= b[0]); + a[5] = (c0[5] *= b[0]); + a[6] = (c0[6] *= b[0]); + a[7] = (c0[7] *= b[0]); + VbS0 = vec_splat(Vb[0], 1); + VbS1 = vec_splat(Vb[1], 0); + VbS2 = vec_splat(Vb[1], 1); + VbS3 = vec_splat(Vb[2], 0); + VbS4 = vec_splat(Vb[2], 1); + VbS5 = vec_splat(Vb[3], 0); + VbS6 = vec_splat(Vb[3], 1); + Vc1[0] = vec_nmsub(Vc0[ 0], VbS0, Vc1[0]); + Vc1[1] = vec_nmsub(Vc0[ 1], VbS0, Vc1[1]); + Vc1[2] = vec_nmsub(Vc0[ 2], VbS0, Vc1[2]); + Vc1[3] = vec_nmsub(Vc0[ 3], VbS0, Vc1[3]); + Vc2[0] = vec_nmsub(Vc0[ 0], VbS1, Vc2[0]); + Vc2[1] = vec_nmsub(Vc0[ 1], VbS1, Vc2[1]); + Vc2[2] = vec_nmsub(Vc0[ 2], VbS1, Vc2[2]); + Vc2[3] = vec_nmsub(Vc0[ 3], VbS1, Vc2[3]); + Vc3[0] = vec_nmsub(Vc0[ 0], VbS2, Vc3[0]); + Vc3[1] = vec_nmsub(Vc0[ 1], VbS2, Vc3[1]); + Vc3[2] = vec_nmsub(Vc0[ 2], VbS2, Vc3[2]); + Vc3[3] = vec_nmsub(Vc0[ 3], VbS2, Vc3[3]); + Vc4[0] = vec_nmsub(Vc0[ 0], VbS3, Vc4[0]); + Vc4[1] = vec_nmsub(Vc0[ 1], VbS3, Vc4[1]); + Vc4[2] = vec_nmsub(Vc0[ 2], VbS3, Vc4[2]); + Vc4[3] = vec_nmsub(Vc0[ 3], VbS3, Vc4[3]); + Vc5[0] = vec_nmsub(Vc0[ 0], VbS4, Vc5[0]); + Vc5[1] = vec_nmsub(Vc0[ 1], VbS4, Vc5[1]); + Vc5[2] = vec_nmsub(Vc0[ 2], VbS4, Vc5[2]); + Vc5[3] = vec_nmsub(Vc0[ 3], VbS4, Vc5[3]); + Vc6[0] = vec_nmsub(Vc0[ 0], VbS5, Vc6[0]); + Vc6[1] = vec_nmsub(Vc0[ 1], VbS5, Vc6[1]); + Vc6[2] = vec_nmsub(Vc0[ 2], VbS5, Vc6[2]); + Vc6[3] = vec_nmsub(Vc0[ 3], VbS5, Vc6[3]); + Vc7[0] = vec_nmsub(Vc0[ 0], VbS6, Vc7[0]); + Vc7[1] = vec_nmsub(Vc0[ 1], VbS6, Vc7[1]); + Vc7[2] = vec_nmsub(Vc0[ 2], VbS6, Vc7[2]); + Vc7[3] = vec_nmsub(Vc0[ 3], VbS6, Vc7[3]); + + a[ 8] = (c1[0] *= b[9]); + a[ 9] = (c1[1] *= 
b[9]); + a[10] = (c1[2] *= b[9]); + a[11] = (c1[3] *= b[9]); + a[12] = (c1[4] *= b[9]); + a[13] = (c1[5] *= b[9]); + a[14] = (c1[6] *= b[9]); + a[15] = (c1[7] *= b[9]); + VbS0 = vec_splat(Vb[5], 0); + VbS1 = vec_splat(Vb[5], 1); + VbS2 = vec_splat(Vb[6], 0); + VbS3 = vec_splat(Vb[6], 1); + VbS4 = vec_splat(Vb[7], 0); + VbS5 = vec_splat(Vb[7], 1); + Vc2[0] = vec_nmsub(Vc1[0], VbS0, Vc2[0]); + Vc2[1] = vec_nmsub(Vc1[1], VbS0, Vc2[1]); + Vc2[2] = vec_nmsub(Vc1[2], VbS0, Vc2[2]); + Vc2[3] = vec_nmsub(Vc1[3], VbS0, Vc2[3]); + Vc3[0] = vec_nmsub(Vc1[0], VbS1, Vc3[0]); + Vc3[1] = vec_nmsub(Vc1[1], VbS1, Vc3[1]); + Vc3[2] = vec_nmsub(Vc1[2], VbS1, Vc3[2]); + Vc3[3] = vec_nmsub(Vc1[3], VbS1, Vc3[3]); + Vc4[0] = vec_nmsub(Vc1[0], VbS2, Vc4[0]); + Vc4[1] = vec_nmsub(Vc1[1], VbS2, Vc4[1]); + Vc4[2] = vec_nmsub(Vc1[2], VbS2, Vc4[2]); + Vc4[3] = vec_nmsub(Vc1[3], VbS2, Vc4[3]); + Vc5[0] = vec_nmsub(Vc1[0], VbS3, Vc5[0]); + Vc5[1] = vec_nmsub(Vc1[1], VbS3, Vc5[1]); + Vc5[2] = vec_nmsub(Vc1[2], VbS3, Vc5[2]); + Vc5[3] = vec_nmsub(Vc1[3], VbS3, Vc5[3]); + Vc6[0] = vec_nmsub(Vc1[0], VbS4, Vc6[0]); + Vc6[1] = vec_nmsub(Vc1[1], VbS4, Vc6[1]); + Vc6[2] = vec_nmsub(Vc1[2], VbS4, Vc6[2]); + Vc6[3] = vec_nmsub(Vc1[3], VbS4, Vc6[3]); + Vc7[0] = vec_nmsub(Vc1[0], VbS5, Vc7[0]); + Vc7[1] = vec_nmsub(Vc1[1], VbS5, Vc7[1]); + Vc7[2] = vec_nmsub(Vc1[2], VbS5, Vc7[2]); + Vc7[3] = vec_nmsub(Vc1[3], VbS5, Vc7[3]); + + a[16] = (c2[0] *= b[18]); + a[17] = (c2[1] *= b[18]); + a[18] = (c2[2] *= b[18]); + a[19] = (c2[3] *= b[18]); + a[20] = (c2[4] *= b[18]); + a[21] = (c2[5] *= b[18]); + a[22] = (c2[6] *= b[18]); + a[23] = (c2[7] *= b[18]); + VbS0 = vec_splat(Vb[ 9], 1); + VbS1 = vec_splat(Vb[10], 0); + VbS2 = vec_splat(Vb[10], 1); + VbS3 = vec_splat(Vb[11], 0); + VbS4 = vec_splat(Vb[11], 1); + Vc3[0] = vec_nmsub(Vc2[0], VbS0, Vc3[0]); + Vc3[1] = vec_nmsub(Vc2[1], VbS0, Vc3[1]); + Vc3[2] = vec_nmsub(Vc2[2], VbS0, Vc3[2]); + Vc3[3] = vec_nmsub(Vc2[3], VbS0, Vc3[3]); + Vc4[0] = vec_nmsub(Vc2[0], VbS1, Vc4[0]); + Vc4[1] = vec_nmsub(Vc2[1], VbS1, Vc4[1]); + Vc4[2] = vec_nmsub(Vc2[2], VbS1, Vc4[2]); + Vc4[3] = vec_nmsub(Vc2[3], VbS1, Vc4[3]); + Vc5[0] = vec_nmsub(Vc2[0], VbS2, Vc5[0]); + Vc5[1] = vec_nmsub(Vc2[1], VbS2, Vc5[1]); + Vc5[2] = vec_nmsub(Vc2[2], VbS2, Vc5[2]); + Vc5[3] = vec_nmsub(Vc2[3], VbS2, Vc5[3]); + Vc6[0] = vec_nmsub(Vc2[0], VbS3, Vc6[0]); + Vc6[1] = vec_nmsub(Vc2[1], VbS3, Vc6[1]); + Vc6[2] = vec_nmsub(Vc2[2], VbS3, Vc6[2]); + Vc6[3] = vec_nmsub(Vc2[3], VbS3, Vc6[3]); + Vc7[0] = vec_nmsub(Vc2[0], VbS4, Vc7[0]); + Vc7[1] = vec_nmsub(Vc2[1], VbS4, Vc7[1]); + Vc7[2] = vec_nmsub(Vc2[2], VbS4, Vc7[2]); + Vc7[3] = vec_nmsub(Vc2[3], VbS4, Vc7[3]); + + a[24] = (c3[0] *= b[27]); + a[25] = (c3[1] *= b[27]); + a[26] = (c3[2] *= b[27]); + a[27] = (c3[3] *= b[27]); + a[28] = (c3[4] *= b[27]); + a[29] = (c3[5] *= b[27]); + a[30] = (c3[6] *= b[27]); + a[31] = (c3[7] *= b[27]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[15], 0); + VbS3 = vec_splat(Vb[15], 1); + Vc4[0] = vec_nmsub(Vc3[0], VbS0, Vc4[0]); + Vc4[1] = vec_nmsub(Vc3[1], VbS0, Vc4[1]); + Vc4[2] = vec_nmsub(Vc3[2], VbS0, Vc4[2]); + Vc4[3] = vec_nmsub(Vc3[3], VbS0, Vc4[3]); + Vc5[0] = vec_nmsub(Vc3[0], VbS1, Vc5[0]); + Vc5[1] = vec_nmsub(Vc3[1], VbS1, Vc5[1]); + Vc5[2] = vec_nmsub(Vc3[2], VbS1, Vc5[2]); + Vc5[3] = vec_nmsub(Vc3[3], VbS1, Vc5[3]); + Vc6[0] = vec_nmsub(Vc3[0], VbS2, Vc6[0]); + Vc6[1] = vec_nmsub(Vc3[1], VbS2, Vc6[1]); + Vc6[2] = vec_nmsub(Vc3[2], VbS2, Vc6[2]); + Vc6[3] = vec_nmsub(Vc3[3], VbS2, Vc6[3]); + Vc7[0] = 
vec_nmsub(Vc3[0], VbS3, Vc7[0]); + Vc7[1] = vec_nmsub(Vc3[1], VbS3, Vc7[1]); + Vc7[2] = vec_nmsub(Vc3[2], VbS3, Vc7[2]); + Vc7[3] = vec_nmsub(Vc3[3], VbS3, Vc7[3]); + + a[32] = (c4[0] *= b[36]); + a[33] = (c4[1] *= b[36]); + a[34] = (c4[2] *= b[36]); + a[35] = (c4[3] *= b[36]); + a[36] = (c4[4] *= b[36]); + a[37] = (c4[5] *= b[36]); + a[38] = (c4[6] *= b[36]); + a[39] = (c4[7] *= b[36]); + VbS0 = vec_splat(Vb[18], 1); + VbS1 = vec_splat(Vb[19], 0); + VbS2 = vec_splat(Vb[19], 1); + Vc5[0] = vec_nmsub(Vc4[0], VbS0, Vc5[0]); + Vc5[1] = vec_nmsub(Vc4[1], VbS0, Vc5[1]); + Vc5[2] = vec_nmsub(Vc4[2], VbS0, Vc5[2]); + Vc5[3] = vec_nmsub(Vc4[3], VbS0, Vc5[3]); + Vc6[0] = vec_nmsub(Vc4[0], VbS1, Vc6[0]); + Vc6[1] = vec_nmsub(Vc4[1], VbS1, Vc6[1]); + Vc6[2] = vec_nmsub(Vc4[2], VbS1, Vc6[2]); + Vc6[3] = vec_nmsub(Vc4[3], VbS1, Vc6[3]); + Vc7[0] = vec_nmsub(Vc4[0], VbS2, Vc7[0]); + Vc7[1] = vec_nmsub(Vc4[1], VbS2, Vc7[1]); + Vc7[2] = vec_nmsub(Vc4[2], VbS2, Vc7[2]); + Vc7[3] = vec_nmsub(Vc4[3], VbS2, Vc7[3]); + + a[40] = (c5[0] *= b[45]); + a[41] = (c5[1] *= b[45]); + a[42] = (c5[2] *= b[45]); + a[43] = (c5[3] *= b[45]); + a[44] = (c5[4] *= b[45]); + a[45] = (c5[5] *= b[45]); + a[46] = (c5[6] *= b[45]); + a[47] = (c5[7] *= b[45]); + VbS0 = vec_splat(Vb[23], 0); + VbS1 = vec_splat(Vb[23], 1); + Vc6[0] = vec_nmsub(Vc5[0], VbS0, Vc6[0]); + Vc6[1] = vec_nmsub(Vc5[1], VbS0, Vc6[1]); + Vc6[2] = vec_nmsub(Vc5[2], VbS0, Vc6[2]); + Vc6[3] = vec_nmsub(Vc5[3], VbS0, Vc6[3]); + Vc7[0] = vec_nmsub(Vc5[0], VbS1, Vc7[0]); + Vc7[1] = vec_nmsub(Vc5[1], VbS1, Vc7[1]); + Vc7[2] = vec_nmsub(Vc5[2], VbS1, Vc7[2]); + Vc7[3] = vec_nmsub(Vc5[3], VbS1, Vc7[3]); + + a[48] = (c6[0] *= b[54]); + a[49] = (c6[1] *= b[54]); + a[50] = (c6[2] *= b[54]); + a[51] = (c6[3] *= b[54]); + a[52] = (c6[4] *= b[54]); + a[53] = (c6[5] *= b[54]); + a[54] = (c6[6] *= b[54]); + a[55] = (c6[7] *= b[54]); + VbS0 = vec_splat(Vb[27], 1); + Vc7[0] = vec_nmsub(Vc6[0], VbS0, Vc7[0]); + Vc7[1] = vec_nmsub(Vc6[1], VbS0, Vc7[1]); + Vc7[2] = vec_nmsub(Vc6[2], VbS0, Vc7[2]); + Vc7[3] = vec_nmsub(Vc6[3], VbS0, Vc7[3]); + + a[56] = (c7[0] *= b[63]); + a[57] = (c7[1] *= b[63]); + a[58] = (c7[2] *= b[63]); + a[59] = (c7[3] *= b[63]); + a[60] = (c7[4] *= b[63]); + a[61] = (c7[5] *= b[63]); + a[62] = (c7[6] *= b[63]); + a[63] = (c7[7] *= b[63]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[0], 2); + VbS3 = vec_splat(Vb[0], 3); + VbS4 = vec_splat(Vb[1], 0); + VbS5 = vec_splat(Vb[1], 1); + VbS6 = vec_splat(Vb[1], 2); + VbS7 = vec_splat(Vb[1], 3); + + Vc0[ 0] = vec_mul(VbS0, Vc0[ 0]); + Vc0[ 1] = vec_mul(VbS0, Vc0[ 1]); + Vc0[ 2] = vec_mul(VbS0, Vc0[ 2]); + Vc0[ 3] = vec_mul(VbS0, Vc0[ 3]); + Va[0] = Vc0[0]; + Va[1] = Vc0[1]; + Va[2] = 
Vc0[2]; + Va[3] = Vc0[3]; + Vc1[0] = vec_nmsub(VbS1, Va[0], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[0], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[0], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[0], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[0], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[0] = vec_nmsub(VbS6, Va[0], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[0] = vec_nmsub(VbS7, Va[0], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + + VbS0 = vec_splat(Vb[2], 1); + VbS1 = vec_splat(Vb[2], 2); + VbS2 = vec_splat(Vb[2], 3); + VbS3 = vec_splat(Vb[3], 0); + VbS4 = vec_splat(Vb[3], 1); + VbS5 = vec_splat(Vb[3], 2); + VbS6 = vec_splat(Vb[3], 3); + + Vc1[0] = vec_mul(VbS0, Vc1[0]); + Vc1[1] = vec_mul(VbS0, Vc1[1]); + Vc1[2] = vec_mul(VbS0, Vc1[2]); + Vc1[3] = vec_mul(VbS0, Vc1[3]); + Va[4] = Vc1[0]; + Va[5] = Vc1[1]; + Va[6] = Vc1[2]; + Va[7] = Vc1[3]; + Vc2[0] = vec_nmsub(VbS1, Va[4], Vc2[0]); + Vc2[1] = vec_nmsub(VbS1, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS1, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS1, Va[7], Vc2[3]); + Vc3[0] = vec_nmsub(VbS2, Va[4], Vc3[0]); + Vc3[1] = vec_nmsub(VbS2, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS2, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS2, Va[7], Vc3[3]); + Vc4[0] = vec_nmsub(VbS3, Va[4], Vc4[0]); + Vc4[1] = vec_nmsub(VbS3, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS3, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS3, Va[7], Vc4[3]); + Vc5[0] = vec_nmsub(VbS4, Va[4], Vc5[0]); + Vc5[1] = vec_nmsub(VbS4, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS4, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS4, Va[7], Vc5[3]); + Vc6[0] = vec_nmsub(VbS5, Va[4], Vc6[0]); + Vc6[1] = vec_nmsub(VbS5, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS5, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS5, Va[7], Vc6[3]); + Vc7[0] = vec_nmsub(VbS6, Va[4], Vc7[0]); + Vc7[1] = vec_nmsub(VbS6, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS6, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS6, Va[7], Vc7[3]); + + VbS0 = vec_splat(Vb[4], 2); + VbS1 = vec_splat(Vb[4], 3); + VbS2 = vec_splat(Vb[5], 0); + VbS3 = vec_splat(Vb[5], 1); + VbS4 = vec_splat(Vb[5], 2); + VbS5 = vec_splat(Vb[5], 3); + + Vc2[0] = vec_mul(VbS0, Vc2[0]); + Vc2[1] = vec_mul(VbS0, Vc2[1]); + Vc2[2] = vec_mul(VbS0, Vc2[2]); + Vc2[3] = vec_mul(VbS0, Vc2[3]); + Va[ 8] = Vc2[0]; + Va[ 9] = Vc2[1]; + Va[10] = Vc2[2]; + Va[11] = Vc2[3]; + Vc3[0] = vec_nmsub(VbS1, Va[ 8], Vc3[0]); + Vc3[1] = vec_nmsub(VbS1, Va[ 9], Vc3[1]); + Vc3[2] = vec_nmsub(VbS1, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS1, Va[11], Vc3[3]); + Vc4[0] = vec_nmsub(VbS2, Va[ 8], Vc4[0]); + Vc4[1] = vec_nmsub(VbS2, Va[ 9], Vc4[1]); + Vc4[2] = vec_nmsub(VbS2, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS2, Va[11], Vc4[3]); + Vc5[0] = vec_nmsub(VbS3, Va[ 8], Vc5[0]); + Vc5[1] = vec_nmsub(VbS3, Va[ 9], Vc5[1]); + Vc5[2] = vec_nmsub(VbS3, Va[10], 
Vc5[2]); + Vc5[3] = vec_nmsub(VbS3, Va[11], Vc5[3]); + Vc6[0] = vec_nmsub(VbS4, Va[ 8], Vc6[0]); + Vc6[1] = vec_nmsub(VbS4, Va[ 9], Vc6[1]); + Vc6[2] = vec_nmsub(VbS4, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS4, Va[11], Vc6[3]); + Vc7[0] = vec_nmsub(VbS5, Va[ 8], Vc7[0]); + Vc7[1] = vec_nmsub(VbS5, Va[ 9], Vc7[1]); + Vc7[2] = vec_nmsub(VbS5, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS5, Va[11], Vc7[3]); + + VbS0 = vec_splat(Vb[6], 3); + VbS1 = vec_splat(Vb[7], 0); + VbS2 = vec_splat(Vb[7], 1); + VbS3 = vec_splat(Vb[7], 2); + VbS4 = vec_splat(Vb[7], 3); + + Vc3[0] = vec_mul(VbS0, Vc3[0]); + Vc3[1] = vec_mul(VbS0, Vc3[1]); + Vc3[2] = vec_mul(VbS0, Vc3[2]); + Vc3[3] = vec_mul(VbS0, Vc3[3]); + Va[12] = Vc3[0]; + Va[13] = Vc3[1]; + Va[14] = Vc3[2]; + Va[15] = Vc3[3]; + Vc4[0] = vec_nmsub(VbS1, Va[12], Vc4[0]); + Vc4[1] = vec_nmsub(VbS1, Va[13], Vc4[1]); + Vc4[2] = vec_nmsub(VbS1, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS1, Va[15], Vc4[3]); + Vc5[0] = vec_nmsub(VbS2, Va[12], Vc5[0]); + Vc5[1] = vec_nmsub(VbS2, Va[13], Vc5[1]); + Vc5[2] = vec_nmsub(VbS2, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS2, Va[15], Vc5[3]); + Vc6[0] = vec_nmsub(VbS3, Va[12], Vc6[0]); + Vc6[1] = vec_nmsub(VbS3, Va[13], Vc6[1]); + Vc6[2] = vec_nmsub(VbS3, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS3, Va[15], Vc6[3]); + Vc7[0] = vec_nmsub(VbS4, Va[12], Vc7[0]); + Vc7[1] = vec_nmsub(VbS4, Va[13], Vc7[1]); + Vc7[2] = vec_nmsub(VbS4, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS4, Va[15], Vc7[3]); + + VbS0 = vec_splat(Vb[9], 0); + VbS1 = vec_splat(Vb[9], 1); + VbS2 = vec_splat(Vb[9], 2); + VbS3 = vec_splat(Vb[9], 3); + + Vc4[0] = vec_mul(VbS0, Vc4[0]); + Vc4[1] = vec_mul(VbS0, Vc4[1]); + Vc4[2] = vec_mul(VbS0, Vc4[2]); + Vc4[3] = vec_mul(VbS0, Vc4[3]); + Va[16] = Vc4[0]; + Va[17] = Vc4[1]; + Va[18] = Vc4[2]; + Va[19] = Vc4[3]; + Vc5[0] = vec_nmsub(VbS1, Va[16], Vc5[0]); + Vc5[1] = vec_nmsub(VbS1, Va[17], Vc5[1]); + Vc5[2] = vec_nmsub(VbS1, Va[18], Vc5[2]); + Vc5[3] = vec_nmsub(VbS1, Va[19], Vc5[3]); + Vc6[0] = vec_nmsub(VbS2, Va[16], Vc6[0]); + Vc6[1] = vec_nmsub(VbS2, Va[17], Vc6[1]); + Vc6[2] = vec_nmsub(VbS2, Va[18], Vc6[2]); + Vc6[3] = vec_nmsub(VbS2, Va[19], Vc6[3]); + Vc7[0] = vec_nmsub(VbS3, Va[16], Vc7[0]); + Vc7[1] = vec_nmsub(VbS3, Va[17], Vc7[1]); + Vc7[2] = vec_nmsub(VbS3, Va[18], Vc7[2]); + Vc7[3] = vec_nmsub(VbS3, Va[19], Vc7[3]); + + VbS0 = vec_splat(Vb[11], 1); + VbS1 = vec_splat(Vb[11], 2); + VbS2 = vec_splat(Vb[11], 3); + + Vc5[0] = vec_mul(VbS0, Vc5[0]); + Vc5[1] = vec_mul(VbS0, Vc5[1]); + Vc5[2] = vec_mul(VbS0, Vc5[2]); + Vc5[3] = vec_mul(VbS0, Vc5[3]); + Va[20] = Vc5[0]; + Va[21] = Vc5[1]; + Va[22] = Vc5[2]; + Va[23] = Vc5[3]; + Vc6[0] = vec_nmsub(VbS1, Va[20], Vc6[0]); + Vc6[1] = vec_nmsub(VbS1, Va[21], Vc6[1]); + Vc6[2] = vec_nmsub(VbS1, Va[22], Vc6[2]); + Vc6[3] = vec_nmsub(VbS1, Va[23], Vc6[3]); + Vc7[0] = vec_nmsub(VbS2, Va[20], Vc7[0]); + Vc7[1] = vec_nmsub(VbS2, Va[21], Vc7[1]); + Vc7[2] = vec_nmsub(VbS2, Va[22], Vc7[2]); + Vc7[3] = vec_nmsub(VbS2, Va[23], Vc7[3]); + + VbS0 = vec_splat(Vb[13], 2); + VbS1 = vec_splat(Vb[13], 3); + + Vc6[0] = vec_mul(VbS0, Vc6[0]); + Vc6[1] = vec_mul(VbS0, Vc6[1]); + Vc6[2] = vec_mul(VbS0, Vc6[2]); + Vc6[3] = vec_mul(VbS0, Vc6[3]); + Va[24] = Vc6[0]; + Va[25] = Vc6[1]; + Va[26] = Vc6[2]; + Va[27] = Vc6[3]; + Vc7[0] = vec_nmsub(VbS1, Va[24], Vc7[0]); + Vc7[1] = vec_nmsub(VbS1, Va[25], Vc7[1]); + Vc7[2] = vec_nmsub(VbS1, Va[26], Vc7[2]); + Vc7[3] = vec_nmsub(VbS1, Va[27], Vc7[3]); + + VbS0 = vec_splat(Vb[15], 3); + + Vc7[0] = vec_mul(VbS0, Vc7[0]); + Vc7[1] = vec_mul(VbS0, 
Vc7[1]); + Vc7[2] = vec_mul(VbS0, Vc7[2]); + Vc7[3] = vec_mul(VbS0, Vc7[3]); + Va[28] = Vc7[0]; + Va[29] = Vc7[1]; + Va[30] = Vc7[2]; + Va[31] = Vc7[3]; +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + while (j > 0) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + if (i > 0) { + do { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; 
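+      /* note: in this RN driver kk started at -offset and has just advanced by
+         GEMM_UNROLL_N; b and c now point at the next GEMM_UNROLL_N-wide column panel. */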
+ j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_RT_power10.c b/kernel/power/trsm_kernel_RT_power10.c new file mode 100644 index 000000000..529590f37 --- /dev/null +++ b/kernel/power/trsm_kernel_RT_power10.c @@ -0,0 +1,855 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6; + + a[56] = (c7[0] *= b[63]); + a[57] = (c7[1] *= b[63]); + a[58] = (c7[2] *= b[63]); + a[59] = (c7[3] *= b[63]); + a[60] = (c7[4] *= b[63]); + a[61] = (c7[5] *= b[63]); + a[62] = (c7[6] *= b[63]); + a[63] = (c7[7] *= b[63]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[29], 0); + VbS3 = vec_splat(Vb[29], 1); + VbS4 = vec_splat(Vb[30], 0); + VbS5 = vec_splat(Vb[30], 1); + VbS6 = vec_splat(Vb[31], 0); + Vc0[0] = vec_nmsub(Vc7[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc7[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc7[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc7[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc7[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc7[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc7[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc7[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc7[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc7[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc7[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc7[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc7[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc7[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc7[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc7[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc7[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc7[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc7[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc7[3], VbS4, Vc4[3]); + Vc5[0] = vec_nmsub(Vc7[0], VbS5, Vc5[0]); + Vc5[1] = vec_nmsub(Vc7[1], VbS5, Vc5[1]); + Vc5[2] = vec_nmsub(Vc7[2], VbS5, Vc5[2]); + Vc5[3] = vec_nmsub(Vc7[3], VbS5, Vc5[3]); + Vc6[0] = vec_nmsub(Vc7[0], VbS6, Vc6[0]); + Vc6[1] = vec_nmsub(Vc7[1], VbS6, Vc6[1]); + Vc6[2] = vec_nmsub(Vc7[2], VbS6, Vc6[2]); + Vc6[3] = vec_nmsub(Vc7[3], VbS6, Vc6[3]); + + a[48] = (c6[0] *= b[54]); + a[49] = (c6[1] *= b[54]); 
+ a[50] = (c6[2] *= b[54]); + a[51] = (c6[3] *= b[54]); + a[52] = (c6[4] *= b[54]); + a[53] = (c6[5] *= b[54]); + a[54] = (c6[6] *= b[54]); + a[55] = (c6[7] *= b[54]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[25], 0); + VbS3 = vec_splat(Vb[25], 1); + VbS4 = vec_splat(Vb[26], 0); + VbS5 = vec_splat(Vb[26], 1); + Vc0[0] = vec_nmsub(Vc6[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc6[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc6[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc6[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc6[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc6[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc6[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc6[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc6[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc6[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc6[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc6[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc6[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc6[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc6[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc6[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc6[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc6[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc6[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc6[3], VbS4, Vc4[3]); + Vc5[0] = vec_nmsub(Vc6[0], VbS5, Vc5[0]); + Vc5[1] = vec_nmsub(Vc6[1], VbS5, Vc5[1]); + Vc5[2] = vec_nmsub(Vc6[2], VbS5, Vc5[2]); + Vc5[3] = vec_nmsub(Vc6[3], VbS5, Vc5[3]); + + a[40] = (c5[0] *= b[45]); + a[41] = (c5[1] *= b[45]); + a[42] = (c5[2] *= b[45]); + a[43] = (c5[3] *= b[45]); + a[44] = (c5[4] *= b[45]); + a[45] = (c5[5] *= b[45]); + a[46] = (c5[6] *= b[45]); + a[47] = (c5[7] *= b[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + Vc0[0] = vec_nmsub(Vc5[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc5[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc5[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc5[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc5[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc5[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc5[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc5[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc5[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc5[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc5[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc5[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc5[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc5[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc5[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc5[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc5[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc5[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc5[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc5[3], VbS4, Vc4[3]); + + a[32] = (c4[0] *= b[36]); + a[33] = (c4[1] *= b[36]); + a[34] = (c4[2] *= b[36]); + a[35] = (c4[3] *= b[36]); + a[36] = (c4[4] *= b[36]); + a[37] = (c4[5] *= b[36]); + a[38] = (c4[6] *= b[36]); + a[39] = (c4[7] *= b[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + Vc0[0] = vec_nmsub(Vc4[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc4[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc4[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc4[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc4[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc4[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc4[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc4[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc4[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc4[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc4[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc4[3], VbS2, Vc2[3]); + 
Vc3[0] = vec_nmsub(Vc4[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc4[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc4[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc4[3], VbS3, Vc3[3]); + + a[24] = (c3[0] *= b[27]); + a[25] = (c3[1] *= b[27]); + a[26] = (c3[2] *= b[27]); + a[27] = (c3[3] *= b[27]); + a[28] = (c3[4] *= b[27]); + a[29] = (c3[5] *= b[27]); + a[30] = (c3[6] *= b[27]); + a[31] = (c3[7] *= b[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + Vc0[0] = vec_nmsub(Vc3[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc3[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc3[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc3[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc3[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc3[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc3[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc3[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc3[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc3[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc3[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc3[3], VbS2, Vc2[3]); + + a[16] = (c2[0] *= b[18]); + a[17] = (c2[1] *= b[18]); + a[18] = (c2[2] *= b[18]); + a[19] = (c2[3] *= b[18]); + a[20] = (c2[4] *= b[18]); + a[21] = (c2[5] *= b[18]); + a[22] = (c2[6] *= b[18]); + a[23] = (c2[7] *= b[18]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + Vc0[0] = vec_nmsub(Vc2[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc2[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc2[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc2[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc2[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc2[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc2[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc2[3], VbS1, Vc1[3]); + + a[ 8] = (c1[0] *= b[9]); + a[ 9] = (c1[1] *= b[9]); + a[10] = (c1[2] *= b[9]); + a[11] = (c1[3] *= b[9]); + a[12] = (c1[4] *= b[9]); + a[13] = (c1[5] *= b[9]); + a[14] = (c1[6] *= b[9]); + a[15] = (c1[7] *= b[9]); + VbS0 = vec_splat(Vb[4], 0); + Vc0[0] = vec_nmsub(Vc1[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc1[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc1[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc1[3], VbS0, Vc0[3]); + + a[0] = (c0[0] *= b[0]); + a[1] = (c0[1] *= b[0]); + a[2] = (c0[2] *= b[0]); + a[3] = (c0[3] *= b[0]); + a[4] = (c0[4] *= b[0]); + a[5] = (c0[5] *= b[0]); + a[6] = (c0[6] *= b[0]); + a[7] = (c0[7] *= b[0]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + + Vc7[0] = vec_mul(VbS7, Vc7[0]); + Vc7[1] = vec_mul(VbS7, Vc7[1]); + Vc7[2] = vec_mul(VbS7, Vc7[2]); + Vc7[3] = vec_mul(VbS7, Vc7[3]); + Va[28] = Vc7[0]; + Va[29] = Vc7[1]; + Va[30] = Vc7[2]; + Va[31] = 
Vc7[3]; + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]); + + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + + Vc6[0] = vec_mul(VbS6, Vc6[0]); + Vc6[1] = vec_mul(VbS6, Vc6[1]); + Vc6[2] = vec_mul(VbS6, Vc6[2]); + Vc6[3] = vec_mul(VbS6, Vc6[3]); + Va[24] = Vc6[0]; + Va[25] = Vc6[1]; + Va[26] = Vc6[2]; + Va[27] = Vc6[3]; + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]); + + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + + Vc5[0] = vec_mul(VbS5, Vc5[0]); + Vc5[1] = vec_mul(VbS5, Vc5[1]); + Vc5[2] = vec_mul(VbS5, Vc5[2]); + Vc5[3] = vec_mul(VbS5, Vc5[3]); + Va[20] = Vc5[0]; + Va[21] = Vc5[1]; + Va[22] = Vc5[2]; + Va[23] = Vc5[3]; + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[21], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, 
Va[21], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + + Vc4[0] = vec_mul(VbS4, Vc4[0]); + Vc4[1] = vec_mul(VbS4, Vc4[1]); + Vc4[2] = vec_mul(VbS4, Vc4[2]); + Vc4[3] = vec_mul(VbS4, Vc4[3]); + Va[16] = Vc4[0]; + Va[17] = Vc4[1]; + Va[18] = Vc4[2]; + Va[19] = Vc4[3]; + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + + VbS0 = vec_splat(Vb[6], 0); + VbS1 = vec_splat(Vb[6], 1); + VbS2 = vec_splat(Vb[6], 2); + VbS3 = vec_splat(Vb[6], 3); + + Vc3[0] = vec_mul(VbS3, Vc3[0]); + Vc3[1] = vec_mul(VbS3, Vc3[1]); + Vc3[2] = vec_mul(VbS3, Vc3[2]); + Vc3[3] = vec_mul(VbS3, Vc3[3]); + Va[12] = Vc3[0]; + Va[13] = Vc3[1]; + Va[14] = Vc3[2]; + Va[15] = Vc3[3]; + Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[4], 2); + + Vc2[0] = vec_mul(VbS2, Vc2[0]); + Vc2[1] = vec_mul(VbS2, Vc2[1]); + Vc2[2] = vec_mul(VbS2, Vc2[2]); + Vc2[3] = vec_mul(VbS2, Vc2[3]); + Va[ 8] = Vc2[0]; + Va[ 9] = Vc2[1]; + Va[10] = Vc2[2]; + Va[11] = Vc2[3]; + Vc0[0] = vec_nmsub(VbS0, Va[ 8], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[ 8], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + + VbS0 = vec_splat(Vb[2], 0); + VbS1 = vec_splat(Vb[2], 1); + + Vc1[0] = vec_mul(VbS1, Vc1[0]); + Vc1[1] = vec_mul(VbS1, Vc1[1]); + Vc1[2] = vec_mul(VbS1, Vc1[2]); + Vc1[3] = vec_mul(VbS1, Vc1[3]); + Va[4] = Vc1[0]; + Va[5] = Vc1[1]; + Va[6] = Vc1[2]; + Va[7] = Vc1[3]; + Vc0[0] = vec_nmsub(VbS0, Va[4], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + + VbS0 = vec_splat(Vb[0], 0); + + Vc0[0] = vec_mul(VbS0, 
Vc0[0]); + Vc0[1] = vec_mul(VbS0, Vc0[1]); + Vc0[2] = vec_mul(VbS0, Vc0[2]); + Vc0[3] = vec_mul(VbS0, Vc0[3]); + Va[0] = Vc0[0]; + Va[1] = Vc0[1]; + Va[2] = Vc0[2]; + Va[3] = Vc0[3]; +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + i >>= 1; + } while (i > 
0); + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } while (i > 0); + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/power/zaxpy_microk_power10.c b/kernel/power/zaxpy_microk_power10.c index 8e593bbfa..b03508b09 100644 --- a/kernel/power/zaxpy_microk_power10.c +++ b/kernel/power/zaxpy_microk_power10.c @@ -30,9 +30,17 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, double alpha_r, double alpha_i) { #if !defined(CONJ) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + static const double mvec[2] = { -1.0, 1.0 }; +#else + static const double mvec[2] = { 1.0, -1.0 }; +#endif +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) static const double mvec[2] = { 1.0, -1.0 }; #else static const double mvec[2] = { -1.0, 1.0 }; +#endif #endif const double *mvecp = mvec; diff --git a/kernel/power/zgemm_kernel_power10.S b/kernel/power/zgemm_kernel_power10.S index fca389e69..afee8f183 100644 --- a/kernel/power/zgemm_kernel_power10.S +++ b/kernel/power/zgemm_kernel_power10.S @@ -147,13 +147,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r0, FLINK_SAVE(SP) -#if defined(linux) || defined(__FreeBSD__) +#if defined(linux) || defined(__FreeBSD__) || defined(_AIX) ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__) || defined(_AIX)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #endif diff --git a/kernel/power/zgemm_macros_power10.S b/kernel/power/zgemm_macros_power10.S index 42f9c5ad4..e5e5ec0e6 100644 --- a/kernel/power/zgemm_macros_power10.S +++ b/kernel/power/zgemm_macros_power10.S @@ -41,23 +41,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef TRMMKERNEL lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VS_OUT1,\VS_TEMP1,\VS_TEMP2 + xxmrgld \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#else xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#endif #endif .endm /*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ .macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ + xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +#else xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +#endif .endm /*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ .macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ + xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +#else xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +#endif .endm /* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ @@ -103,8 +118,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VSOUT1,\VSIN1,\VSIN2 + xxmrgld \VSOUT2,\VSIN1,\VSIN2 +#else xxmrghd \VSOUT1,\VSIN2,\VSIN1 xxmrgld \VSOUT2,\VSIN2,\VSIN1 +#endif .endm @@ -186,15 +206,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35 #ifndef TRMMKERNEL lxv vs50, (\LOFFSET)(\BASE_REG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd vs46,vs50,vs50 + xxmrgld vs47,vs50,vs50 +#else xxmrgld vs46,vs50,vs50 xxmrghd vs47,vs50,vs50 +#endif #endif RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37 AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 MULT_APLHA_PART1 vs34,vs36, vs46,vs47 MULT_APLHA_PART2 vs34,vs36, vs46,vs47 UNPACK_FOR_STORE vs46,vs47,vs39,vs41 +#if (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) xxmrghd vs39,vs47,vs46 +#endif stxv vs39, (\LOFFSET)(\BASE_REG) .endm @@ -232,6 +259,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + xvf64gerpp 4, vs32, vs49 + xvf64gerpp 5, vs34, vs49 + xvf64gerpp 6, vs36, vs49 + xvf64gerpp 7, vs38, vs49 +#else xvf64gerpp 0, vs32, vs49 xvf64gerpp 1, vs34, vs49 xvf64gerpp 2, vs36, vs49 @@ -240,11 +277,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvf64gerpp 5, vs34, vs48 xvf64gerpp 6, vs36, vs48 xvf64gerpp 7, vs38, vs48 +#endif lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs40, vs50 + xvf64gerpp 1, vs42, vs50 + xvf64gerpp 2, vs44, vs50 + xvf64gerpp 3, vs46, vs50 + xvf64gerpp 4, vs40, vs51 + xvf64gerpp 5, vs42, vs51 + xvf64gerpp 6, vs44, vs51 + xvf64gerpp 7, vs46, vs51 +#else xvf64gerpp 0, vs40, vs51 xvf64gerpp 1, vs42, vs51 xvf64gerpp 2, vs44, vs51 @@ -253,6 +301,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf64gerpp 5, vs42, vs50 xvf64gerpp 6, vs44, vs50 xvf64gerpp 7, vs46, vs50 +#endif .if \IsLast==1 addi AO, AO, DISP16(\Index,256) addi BO, BO, DISP4(\Index,64) @@ -261,6 +310,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD_END_2x8 OffsetA,OffsetB +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + xvf64gerpp 4, vs32, vs49 + xvf64gerpp 5, vs34, vs49 + xvf64gerpp 6, vs36, vs49 + xvf64gerpp 7, vs38, vs49 +#else xvf64gerpp 0, vs32, vs49 xvf64gerpp 1, vs34, vs49 xvf64gerpp 2, vs36, vs49 @@ -269,6 +328,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf64gerpp 5, vs34, vs48 xvf64gerpp 6, vs36, vs48 xvf64gerpp 7, vs38, vs48 +#endif addi BO, BO, \OffsetB addi AO, AO, \OffsetA .endm @@ -305,7 +365,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs45, vs12, vs13, 0b10 xxpermdi vs46, vs14, vs15, 0b01 xxpermdi vs47, vs14, vs15, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 + xxlor vs8, vs40, vs40 + xxlor vs9, vs41, vs41 + xxlor vs10, vs42, vs42 + xxlor vs11, vs43, vs43 + xxlor vs12, vs44, vs44 + xxlor vs13, vs45, vs45 + xxlor vs14, vs46, vs46 + xxlor vs15, vs47, vs47 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -322,7 +399,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs15, vs45, vs45 xxlor vs12, vs46, vs46 xxlor vs13, vs47, vs47 - +#endif xxpermdi vs32, vs16, vs17, 0b01 xxpermdi vs33, vs16, vs17, 0b10 xxpermdi vs34, vs18, vs19, 0b01 @@ -339,7 +416,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs45, vs28, vs29, 0b10 xxpermdi vs46, vs30, vs31, 0b01 xxpermdi vs47, vs30, vs31, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs16, vs32, vs32 + xxlor vs17, vs33, vs33 + xxlor vs18, vs34, vs34 + xxlor vs19, vs35, vs35 + xxlor vs20, vs36, vs36 + xxlor vs21, vs37, vs37 + xxlor vs22, vs38, vs38 + xxlor vs23, vs39, vs39 + xxlor vs24, vs40, vs40 + xxlor vs25, vs41, vs41 + xxlor vs26, vs42, vs42 + xxlor vs27, vs43, vs43 + xxlor vs28, vs44, vs44 + xxlor vs29, vs45, vs45 + xxlor vs30, vs46, vs46 + xxlor vs31, vs47, vs47 +#else xxlor vs18, vs32, vs32 xxlor vs19, vs33, vs33 xxlor vs16, vs34, vs34 @@ -356,7 +450,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlor vs31, vs45, vs45 xxlor vs28, vs46, vs46 xxlor vs29, vs47, vs47 - +#endif SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0 addi CO, CO, 128 @@ -388,17 +482,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 2, vs32, vs48 - xvf64gerpp 3, vs34, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs32, vs49 + xvf64gerpp 3, vs34, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 +#endif lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B - xvf64gerpp 0, vs40, vs51 - xvf64gerpp 1, vs42, vs51 - xvf64gerpp 2, vs40, vs50 - xvf64gerpp 3, vs42, vs50 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs40, vs50 + xvf64gerpp 1, vs42, vs50 + xvf64gerpp 2, vs40, vs51 + xvf64gerpp 3, vs42, vs51 +#else + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs42, vs51 + xvf64gerpp 2, vs40, vs50 + xvf64gerpp 3, vs42, vs50 +#endif .if \IsLast==1 addi AO, AO, DISP8(\Index,128) addi BO, BO, DISP4(\Index,64) @@ -407,10 +515,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD_END_2x4 OffsetA, OffsetB - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 2, vs32, vs48 - xvf64gerpp 3, vs34, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs32, vs49 + xvf64gerpp 3, vs34, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 +#endif addi BO, BO, \OffsetB addi AO, AO, \OffsetA .endm @@ -443,7 +558,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs45, vs12, vs13, 0b10 xxpermdi vs46, vs14, vs15, 0b01 xxpermdi vs47, vs14, vs15, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 + xxlor vs8, vs40, vs40 + xxlor vs9, vs41, vs41 + xxlor vs10, vs42, vs42 + xxlor vs11, vs43, vs43 + xxlor vs12, vs44, vs44 + xxlor vs13, vs45, vs45 + xxlor vs14, vs46, vs46 + xxlor vs15, vs47, vs47 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -460,7 +592,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs15, vs45, vs45 xxlor vs12, vs46, vs46 xxlor vs13, vs47, vs47 - +#endif SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0 addi CO, CO, 64 @@ -488,12 +620,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x2_2 Index, IsLast lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs32, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs32, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 +#endif lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B - xvf64gerpp 0, vs40, vs51 - xvf64gerpp 1, vs40, vs50 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs40, vs50 + xvf64gerpp 1, vs40, vs51 +#else + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs40, vs50 +#endif .if \IsLast==1 addi AO, AO, DISP4(\Index,64) addi BO, BO, DISP4(\Index,64) @@ -502,8 +644,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD_END_2x2 OffsetA,OffsetB - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs32, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs32, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 +#endif addi BO, BO, \OffsetB addi AO, AO, \OffsetA .endm @@ -526,7 +673,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs37, vs4, vs5, 0b10 xxpermdi vs38, vs6, vs7, 0b01 xxpermdi vs39, vs6, vs7, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -535,7 +691,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs7, vs37, vs37 xxlor vs4, vs38, vs38 xxlor vs5, vs39, vs39 - +#endif SAVE2 vs0,vs1,vs2,vs3,CO,0 SAVE2 vs4,vs5,vs6,vs7,T1,0 addi CO, CO, 32 @@ -702,14 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 2, vs36, vs49 - xvf64gerpp 3, vs38, vs49 - xvf64gerpp 0, vs40, vs48 - xvf64gerpp 1, vs42, vs48 - xvf64gerpp 2, vs44, vs48 - xvf64gerpp 3, vs46, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + xvf64gerpp 0, vs40, vs49 + xvf64gerpp 1, vs42, vs49 + xvf64gerpp 2, vs44, vs49 + xvf64gerpp 3, vs46, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 + xvf64gerpp 2, vs44, vs48 + xvf64gerpp 3, vs46, vs48 +#endif .if \IsLast==1 addi AO, AO, DISP16(\Index,256) addi BO, BO, DISP2(\Index,32) @@ -758,7 +925,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxpermdi vs45, vs12, vs13, 0b10 xxpermdi vs46, vs14, vs15, 0b01 xxpermdi vs47, vs14, vs15, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 + xxlor vs8, vs40, vs40 + xxlor vs9, vs41, vs41 + xxlor vs10, vs42, vs42 + xxlor vs11, vs43, vs43 + xxlor vs12, vs44, vs44 + xxlor vs13, vs45, vs45 + xxlor vs14, vs46, vs46 + xxlor vs15, vs47, vs47 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -775,7 +959,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs15, vs45, vs45 xxlor vs12, vs46, vs46 xxlor vs13, vs47, vs47 - +#endif SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 addi CO, CO, 128 .endm @@ -799,10 +983,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 0, vs40, vs48 - xvf64gerpp 1, vs42, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 0, vs40, vs49 + xvf64gerpp 1, vs42, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 +#endif .if \IsLast==1 addi AO, AO, DISP8(\Index,128) addi BO, BO, DISP2(\Index,32) @@ -837,7 +1028,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs37, vs4, vs5, 0b10 xxpermdi vs38, vs6, vs7, 0b01 xxpermdi vs39, vs6, vs7, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -846,7 +1046,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs7, vs37, vs37 xxlor vs4, vs38, vs38 xxlor vs5, vs39, vs39 - +#endif SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 addi CO, CO, 64 .endm @@ -867,8 +1067,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 0, vs40, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 0, vs40, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 0, vs40, vs48 +#endif .if \IsLast==1 addi AO, AO, DISP4(\Index,64) addi BO, BO, DISP2(\Index,32) @@ -896,11 +1101,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxpermdi vs33, vs0, vs1, 0b10 xxpermdi vs34, vs2, vs3, 0b01 xxpermdi vs35, vs2, vs3, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 xxlor vs1, vs35, vs35 +#endif SAVE2 vs0,vs1,vs2,vs3,CO,0 addi CO, CO, 32 diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S index 708f1318d..48f49f97b 100644 --- a/kernel/power/zgemv_n.S +++ b/kernel/power/zgemv_n.S @@ -155,6 +155,11 @@ #define PREFETCHSIZE_C 16 #endif +#ifdef POWER3 +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + #ifdef POWER4 #define PREFETCHSIZE_A 34 #define PREFETCHSIZE_C 16 diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c index 1f7199c89..366c21681 100644 --- a/kernel/power/zgemv_n_4.c +++ b/kernel/power/zgemv_n_4.c @@ -607,7 +607,6 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { BLASLONG i; - BLASLONG j; FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; diff --git a/kernel/power/zgemv_n_power10.c b/kernel/power/zgemv_n_power10.c new file mode 100644 index 000000000..a545b00d8 --- /dev/null +++ b/kernel/power/zgemv_n_power10.c @@ -0,0 +1,1101 @@ +/*************************************************************************** +Copyright (c) 2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#include +#include +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) + +#define HAVE_KERNEL_4x4_VEC 1 +#define HAVE_KERNEL_4x2_VEC 1 +#define HAVE_KERNEL_4x1_VEC 1 +#define HAVE_KERNEL_ADDY 1 + +#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) +#include +#endif +#endif + +// +#define NBMAX 4096 + +#ifdef HAVE_KERNEL_4x4_VEC_ASM + +#elif HAVE_KERNEL_4x4_VEC +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define SAVE_RESULT(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0][0] = result[0][0] - result[1][1]; \ + result[0][1] = result[0][1] + result[1][0]; \ + result[1][0] = result[2][0] - result[3][1]; \ + result[1][1] = result[2][1] + result[3][0]; \ + rowC = (v4sf_t *) &y[i2 + J]; \ + rowC[0] += result[0]; \ + rowC[1] += result[1]; +#else +#define SAVE_RESULT(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0][0] = result[0][0] + result[1][1]; \ + result[0][1] = result[0][1] - result[1][0]; \ + result[1][0] = result[2][0] + result[3][1]; \ + result[1][1] = result[2][1] - result[3][0]; \ + rowC = (v4sf_t *) &y[i2 + J]; \ + rowC[0] += result[0]; \ + rowC[1] += result[1]; +#endif + +static void zgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector_quad acc0, acc1, acc2, acc3; + v4sf_t result[4]; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + + register __vector double vx0_r = {x[0], x[1]}; + register __vector double vx1_r = {x[2], x[3]}; + register __vector double vx2_r = {x[4], x[5]}; + register __vector double vx3_r = {x[6], x[7]}; + register __vector double vx4_r = {x[8], x[9]}; + register __vector double vx5_r = {x[10], x[11]}; + register __vector double vx6_r = {x[12], x[13]}; + register __vector double vx7_r = {x[14], x[15]}; + __vector_pair *Va0, *Va1, *Va2, *Va3; + __vector_pair *Va4, *Va5, *Va6, *Va7; + BLASLONG i = 0, i2 = 0; + v4sf_t *rowC; + BLASLONG tmp = (n / 8) * 8; + for (i = 0; i < tmp; i += 8) { + i2 = i*2; + Va0 = ((__vector_pair*)((void*)&a0[i2])); + Va1 = ((__vector_pair*)((void*)&a1[i2])); + Va2 = ((__vector_pair*)((void*)&a2[i2])); + Va3 = ((__vector_pair*)((void*)&a3[i2])); + Va4 = ((__vector_pair*)((void*)&a4[i2])); + Va5 = ((__vector_pair*)((void*)&a5[i2])); + Va6 = ((__vector_pair*)((void*)&a6[i2])); + Va7 = ((__vector_pair*)((void*)&a7[i2])); + + __builtin_mma_xvf64ger (&acc0, Va0[0], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc1, Va0[1], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc0, Va1[0], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc1, Va1[1], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc0, Va2[0], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc1, Va2[1], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc0, Va3[0], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc1, Va3[1], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc0, Va4[0], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc1, Va4[1], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc0, Va5[0], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc1, Va5[1], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc0, Va6[0], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc1, Va6[1], (vec_t ) vx6_r); + 
__builtin_mma_xvf64gerpp (&acc0, Va7[0], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc1, Va7[1], (vec_t ) vx7_r); + __builtin_mma_xvf64ger (&acc2, Va0[2], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc3, Va0[3], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc2, Va1[2], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc3, Va1[3], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc2, Va2[2], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc3, Va2[3], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc2, Va3[2], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc3, Va3[3], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc2, Va4[2], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc3, Va4[3], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc2, Va5[2], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc3, Va5[3], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc2, Va6[2], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc3, Va6[3], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc2, Va7[2], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc3, Va7[3], (vec_t ) vx7_r); + SAVE_RESULT(&acc0, 0); + SAVE_RESULT(&acc1, 4); + SAVE_RESULT(&acc2, 8); + SAVE_RESULT(&acc3, 12); + } + while (i < n) { + i2 = i*2; + Va0 = ((__vector_pair*)((void*)&a0[i2])); + Va1 = ((__vector_pair*)((void*)&a1[i2])); + Va2 = ((__vector_pair*)((void*)&a2[i2])); + Va3 = ((__vector_pair*)((void*)&a3[i2])); + Va4 = ((__vector_pair*)((void*)&a4[i2])); + Va5 = ((__vector_pair*)((void*)&a5[i2])); + Va6 = ((__vector_pair*)((void*)&a6[i2])); + Va7 = ((__vector_pair*)((void*)&a7[i2])); + + __builtin_mma_xvf64ger (&acc0, Va0[0], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc1, Va0[1], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc0, Va1[0], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc1, Va1[1], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc0, Va2[0], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc1, Va2[1], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc0, Va3[0], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc1, Va3[1], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc0, Va4[0], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc1, Va4[1], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc0, Va5[0], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc1, Va5[1], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc0, Va6[0], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc1, Va6[1], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc0, Va7[0], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc1, Va7[1], (vec_t ) vx7_r); + SAVE_RESULT(&acc0, 0); + SAVE_RESULT(&acc1, 4); + i += 4; + } +} +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + register __vector double vx1_r = {x[2], x[2]}; + register __vector double vx1_i = {-x[3], x[3]}; + register __vector double vx2_r = {x[4], x[4]}; + register __vector double vx2_i = {-x[5], x[5]}; + register __vector double vx3_r = {x[6], x[6]}; + register __vector double vx3_i = {-x[7], x[7]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; + register __vector double vx1_r = {x[2], -x[2]}; + register __vector double vx1_i = {x[3], x[3]}; + register __vector double vx2_r = {x[4], -x[4]}; + register __vector double vx2_i = {x[5], x[5]}; 
+ register __vector double vx3_r = {x[6], -x[6]}; + register __vector double vx3_i = {x[7], x[7]}; +#endif + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + register __vector double *vptr_a1 = (__vector double *) a1; + register __vector double *vptr_a2 = (__vector double *) a2; + register __vector double *vptr_a3 = (__vector double *) a3; + + + register __vector double vy_0; + register __vector double va0; + register __vector double va1; + register __vector double va2; + register __vector double va3; + register __vector double vy_1; + register __vector double va0_1; + register __vector double va1_1; + register __vector double va2_1; + register __vector double va3_1; + register __vector double vy_2; + register __vector double va0_2; + register __vector double va1_2; + register __vector double va2_2; + register __vector double va3_2; + register __vector double vy_3; + register __vector double va0_3; + register __vector double va1_3; + register __vector double va2_3; + register __vector double va3_3; + + BLASLONG i = 0; + while (i < n) { + + vy_0 = vy[i]; + va0 = vptr_a0[i]; + va1 = vptr_a1[i]; + va2 = vptr_a2[i]; + va3 = vptr_a3[i]; + + vy_1 = vy[i + 1]; + va0_1 = vptr_a0[i + 1]; + va1_1 = vptr_a1[i + 1]; + va2_1 = vptr_a2[i + 1]; + va3_1 = vptr_a3[i + 1]; + + vy_2 = vy[i + 2]; + va0_2 = vptr_a0[i + 2]; + va1_2 = vptr_a1[i + 2]; + va2_2 = vptr_a2[i + 2]; + va3_2 = vptr_a3[i + 2]; + + vy_3 = vy[i + 3]; + va0_3 = vptr_a0[i + 3]; + va1_3 = vptr_a1[i + 3]; + va2_3 = vptr_a2[i + 3]; + va3_3 = vptr_a3[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + + vy_0 += va1*vx1_r; + vy_1 += va1_1*vx1_r; + vy_2 += va1_2*vx1_r; + vy_3 += va1_3*vx1_r; + + va0 = vec_xxpermdi(va0, va0, 2); + va0_1 = vec_xxpermdi(va0_1, va0_1, 2); + + + vy_0 += va2*vx2_r; + vy_1 += va2_1*vx2_r; + va0_2 = vec_xxpermdi(va0_2, va0_2, 2); + va0_3 = vec_xxpermdi(va0_3, va0_3, 2); + vy_2 += va2_2*vx2_r; + vy_3 += va2_3*vx2_r; + + va1 = vec_xxpermdi(va1, va1, 2); + va1_1 = vec_xxpermdi(va1_1, va1_1, 2); + + + vy_0 += va3*vx3_r; + vy_1 += va3_1*vx3_r; + + va1_2 = vec_xxpermdi(va1_2, va1_2, 2); + va1_3 = vec_xxpermdi(va1_3, va1_3, 2); + + vy_2 += va3_2*vx3_r; + vy_3 += va3_3*vx3_r; + + va2 = vec_xxpermdi(va2, va2, 2); + va2_1 = vec_xxpermdi(va2_1, va2_1, 2); + + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + + va2_2 = vec_xxpermdi(va2_2, va2_2, 2); + va2_3 = vec_xxpermdi(va2_3, va2_3, 2); + + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + va3 = vec_xxpermdi(va3, va3, 2); + va3_1 = vec_xxpermdi(va3_1, va3_1, 2); + + + vy_0 += va1*vx1_i; + vy_1 += va1_1*vx1_i; + + va3_2 = vec_xxpermdi(va3_2, va3_2, 2); + va3_3 = vec_xxpermdi(va3_3, va3_3, 2); + + vy_2 += va1_2*vx1_i; + vy_3 += va1_3*vx1_i; + + vy_0 += va2*vx2_i; + vy_1 += va2_1*vx2_i; + vy_2 += va2_2*vx2_i; + vy_3 += va2_3*vx2_i; + + vy_0 += va3*vx3_i; + vy_1 += va3_1*vx3_i; + vy_2 += va3_2*vx3_i; + vy_3 += va3_3*vx3_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + + i += 4; + + + } + +} +#else + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] - a1[i + 1] * 
x[3]; + y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; + y[i] += a2[i] * x[4] - a2[i + 1] * x[5]; + y[i + 1] += a2[i] * x[5] + a2[i + 1] * x[4]; + y[i] += a3[i] * x[6] - a3[i + 1] * x[7]; + y[i + 1] += a3[i] * x[7] + a3[i + 1] * x[6]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; + y[i] += a2[i] * x[4] + a2[i + 1] * x[5]; + y[i + 1] += a2[i] * x[5] - a2[i + 1] * x[4]; + y[i] += a3[i] * x[6] + a3[i + 1] * x[7]; + y[i + 1] += a3[i] * x[7] - a3[i + 1] * x[6]; +#endif + } +} + +#endif + +#ifdef HAVE_KERNEL_4x2_VEC + +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + register __vector double vx1_r = {x[2], x[2]}; + register __vector double vx1_i = {-x[3], x[3]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; + register __vector double vx1_r = {x[2], -x[2]}; + register __vector double vx1_i = {x[3], x[3]}; +#endif + + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + register __vector double *vptr_a1 = (__vector double *) a1; + + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vy[i]; + register __vector double vy_1 = vy[i + 1]; + register __vector double vy_2 = vy[i + 2]; + register __vector double vy_3 = vy[i + 3]; + + register __vector double va0 = vptr_a0[i]; + register __vector double va0_1 = vptr_a0[i + 1]; + register __vector double va0_2 = vptr_a0[i + 2]; + register __vector double va0_3 = vptr_a0[i + 3]; + + register __vector double va1 = vptr_a1[i]; + register __vector double va1_1 = vptr_a1[i + 1]; + register __vector double va1_2 = vptr_a1[i + 2]; + register __vector double va1_3 = vptr_a1[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + va0 = vec_xxpermdi(va0, va0, 2); + va0_1 = vec_xxpermdi(va0_1, va0_1, 2); + va0_2 = vec_xxpermdi(va0_2, va0_2, 2); + va0_3 = vec_xxpermdi(va0_3, va0_3, 2); + + vy_0 += va1*vx1_r; + vy_1 += va1_1*vx1_r; + vy_2 += va1_2*vx1_r; + vy_3 += va1_3*vx1_r; + + va1 = vec_xxpermdi(va1, va1, 2); + va1_1 = vec_xxpermdi(va1_1, va1_1, 2); + va1_2 = vec_xxpermdi(va1_2, va1_2, 2); + va1_3 = vec_xxpermdi(va1_3, va1_3, 2); + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + vy_0 += va1*vx1_i; + vy_1 += va1_1*vx1_i; + vy_2 += va1_2*vx1_i; + vy_3 += va1_3*vx1_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + } +} +#else + +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; +#endif + } +} + +#endif + 
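+/*
+ * Note on the vectorized complex arithmetic used by the kernels above and
+ * below (an explanatory sketch, not part of the algorithm itself): each
+ * __vector double holds one complex element as {real, imag}.  When neither
+ * CONJ nor XCONJ is defined (or both are), the splats are vx_r = {xr, xr}
+ * and vx_i = {-xi, xi}, so with va = {ar, ai}:
+ *
+ *     vy += va * vx_r;                  // {ar*xr,          ai*xr}
+ *     va  = vec_xxpermdi(va, va, 2);    // swap halves  ->  {ai, ar}
+ *     vy += va * vx_i;                  // {ar*xr - ai*xi,  ai*xr + ar*xi}
+ *
+ * which is exactly the complex product a*x accumulated into y.  Worked
+ * example: a = 1+2i, x = 3+4i gives {1*3 - 2*4, 2*3 + 1*4} = {-5, 10},
+ * i.e. -5+10i.  The conjugated variants only flip the signs in vx_r/vx_i.
+ */
+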
+#ifdef HAVE_KERNEL_4x1_VEC + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0; + a0 = ap; + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; +#endif + + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vy[i]; + register __vector double vy_1 = vy[i + 1]; + register __vector double vy_2 = vy[i + 2]; + register __vector double vy_3 = vy[i + 3]; + + register __vector double va0 = vptr_a0[i]; + register __vector double va0_1 = vptr_a0[i + 1]; + register __vector double va0_2 = vptr_a0[i + 2]; + register __vector double va0_3 = vptr_a0[i + 3]; + + register __vector double va0x = vec_xxpermdi(va0, va0, 2); + register __vector double va0x_1 = vec_xxpermdi(va0_1, va0_1, 2); + register __vector double va0x_2 = vec_xxpermdi(va0_2, va0_2, 2); + register __vector double va0x_3 = vec_xxpermdi(va0_3, va0_3, 2); + vy_0 += va0*vx0_r + va0x*vx0_i; + vy_1 += va0_1*vx0_r + va0x_1*vx0_i; + vy_2 += va0_2*vx0_r + va0x_2*vx0_i; + vy_3 += va0_3*vx0_r + va0x_3*vx0_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + } +} + +#else + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; +#endif + + } +} + +#endif + +#ifdef HAVE_KERNEL_ADDY + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + + +#if !defined(XCONJ) + + register __vector double valpha_r = {alpha_r, alpha_r}; + register __vector double valpha_i = {-alpha_i, alpha_i}; + +#else + register __vector double valpha_r = {alpha_r, -alpha_r}; + register __vector double valpha_i = {alpha_i, alpha_i}; +#endif + + register __vector double *vptr_src = (__vector double *) src; + if (inc_dest != 2) { + register __vector double *vptr_y = (__vector double *) dest; + //note that inc_dest is already 2x. 
so we should add it to double* + register __vector double *vptr_y1 = (__vector double *) (dest + inc_dest); + register __vector double *vptr_y2 = (__vector double *) (dest + 2 * inc_dest); + register __vector double *vptr_y3 = (__vector double *) (dest + 3 * inc_dest); + BLASLONG dest_t = 0; + BLASLONG add_dest = inc_dest << 1; //inc_dest is already multiplied by 2, so for vector 4 we just multiply 2 times + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vptr_y[dest_t]; + register __vector double vy_1 = vptr_y1[dest_t]; + register __vector double vy_2 = vptr_y2[dest_t]; + register __vector double vy_3 = vptr_y3[dest_t]; + + register __vector double vsrc = vptr_src[i]; + register __vector double vsrc_1 = vptr_src[i + 1]; + register __vector double vsrc_2 = vptr_src[i + 2]; + register __vector double vsrc_3 = vptr_src[i + 3]; + + vy_0 += vsrc*valpha_r; + vy_1 += vsrc_1*valpha_r; + vy_2 += vsrc_2*valpha_r; + vy_3 += vsrc_3*valpha_r; + + vsrc = vec_xxpermdi(vsrc, vsrc, 2); + vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2); + vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2); + vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2); + + vy_0 += vsrc*valpha_i; + vy_1 += vsrc_1*valpha_i; + vy_2 += vsrc_2*valpha_i; + vy_3 += vsrc_3*valpha_i; + + vptr_y[dest_t] = vy_0; + vptr_y1[dest_t ] = vy_1; + vptr_y2[dest_t] = vy_2; + vptr_y3[dest_t] = vy_3; + + dest_t += add_dest; + + } + + return; + } else { + register __vector double *vptr_y = (__vector double *) dest; + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vptr_y[i]; + register __vector double vy_1 = vptr_y[i + 1]; + register __vector double vy_2 = vptr_y[i + 2]; + register __vector double vy_3 = vptr_y[i + 3]; + + register __vector double vsrc = vptr_src[i]; + register __vector double vsrc_1 = vptr_src[i + 1]; + register __vector double vsrc_2 = vptr_src[i + 2]; + register __vector double vsrc_3 = vptr_src[i + 3]; + + vy_0 += vsrc*valpha_r; + vy_1 += vsrc_1*valpha_r; + vy_2 += vsrc_2*valpha_r; + vy_3 += vsrc_3*valpha_r; + + vsrc = vec_xxpermdi(vsrc, vsrc, 2); + vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2); + vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2); + vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2); + + vy_0 += vsrc*valpha_i; + vy_1 += vsrc_1*valpha_i; + vy_2 += vsrc_2*valpha_i; + vy_3 += vsrc_3*valpha_i; + + vptr_y[i] = vy_0; + vptr_y[i + 1 ] = vy_1; + vptr_y[i + 2] = vy_2; + vptr_y[i + 3] = vy_3; + + } + + return; + } + return; +} + +#else + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + + if (inc_dest != 2) { + + FLOAT temp_r; + FLOAT temp_i; + for (i = 0; i < n; i++) { +#if !defined(XCONJ) + temp_r = alpha_r * src[0] - alpha_i * src[1]; + temp_i = alpha_r * src[1] + alpha_i * src[0]; +#else + temp_r = alpha_r * src[0] + alpha_i * src[1]; + temp_i = -alpha_r * src[1] + alpha_i * src[0]; +#endif + + *dest += temp_r; + *(dest + 1) += temp_i; + + src += 2; + dest += inc_dest; + } + return; + } + + FLOAT temp_r0; + FLOAT temp_i0; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT temp_r2; + FLOAT temp_i2; + FLOAT temp_r3; + FLOAT temp_i3; + for (i = 0; i < n; i += 4) { +#if !defined(XCONJ) + temp_r0 = alpha_r * src[0] - alpha_i * src[1]; + temp_i0 = alpha_r * src[1] + alpha_i * src[0]; + temp_r1 = alpha_r * src[2] - alpha_i * src[3]; + temp_i1 = alpha_r * src[3] + alpha_i * src[2]; + temp_r2 = alpha_r * src[4] - alpha_i * src[5]; + temp_i2 = alpha_r * src[5] + alpha_i * src[4]; + temp_r3 = alpha_r * src[6] - alpha_i * src[7]; + temp_i3 = alpha_r * src[7] + alpha_i * src[6]; +#else + 
temp_r0 = alpha_r * src[0] + alpha_i * src[1]; + temp_i0 = -alpha_r * src[1] + alpha_i * src[0]; + temp_r1 = alpha_r * src[2] + alpha_i * src[3]; + temp_i1 = -alpha_r * src[3] + alpha_i * src[2]; + temp_r2 = alpha_r * src[4] + alpha_i * src[5]; + temp_i2 = -alpha_r * src[5] + alpha_i * src[4]; + temp_r3 = alpha_r * src[6] + alpha_i * src[7]; + temp_i3 = -alpha_r * src[7] + alpha_i * src[6]; +#endif + + dest[0] += temp_r0; + dest[1] += temp_i0; + dest[2] += temp_r1; + dest[3] += temp_i1; + dest[4] += temp_r2; + dest[5] += temp_i2; + dest[6] += temp_r3; + dest[7] += temp_i3; + + src += 8; + dest += 8; + } + return; +} +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT xbuffer[16] __attribute__((aligned(16))); + FLOAT *ybuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + ybuffer = buffer; + + inc_x *= 2; + inc_y *= 2; + lda *= 2; + + n1 = n / 8; + n2 = n % 8; + + m3 = m % 4; + m1 = m - (m % 4); + m2 = (m % NBMAX) - (m % 4); + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + a_ptr = a; + + x_ptr = x; + //zero_y(NB,ybuffer); + memset(ybuffer, 0, NB * 16); + + if (inc_x == 2) { + + for (i = 0; i < n1; i++) { + zgemv_kernel_4x8(NB, lda, a_ptr, x_ptr, ybuffer); + + a_ptr += lda << 3; + x_ptr += 16; + } + if (n2 & 4) { + zgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer); + + a_ptr += lda << 2; + x_ptr += 8; + } + + if (n2 & 2) { + zgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer); + x_ptr += 4; + a_ptr += 2 * lda; + + } + + if (n2 & 1) { + zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer); + x_ptr += 2; + a_ptr += lda; + + } + } else { + + for (i = 0; i < n1; i++) { + + xbuffer[0] = x_ptr[0]; + xbuffer[1] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + xbuffer[3] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[4] = x_ptr[0]; + xbuffer[5] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[6] = x_ptr[0]; + xbuffer[7] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[8] = x_ptr[0]; + xbuffer[9] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[10] = x_ptr[0]; + xbuffer[11] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[12] = x_ptr[0]; + xbuffer[13] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[14] = x_ptr[0]; + xbuffer[15] = x_ptr[1]; + x_ptr += inc_x; + zgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer); + + a_ptr += lda << 3; + } + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + xbuffer[1] = x_ptr[1]; + x_ptr += inc_x; + zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); + a_ptr += lda; + + } + + } + + add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i); + a += 2 * NB; + y_ptr += NB * inc_y; + } + + if (m3 == 0) return (0); + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r = 0.0; + FLOAT temp_i = 0.0; + + if (lda == 2 && inc_x == 2) { + + for (i = 0; i < (n & -2); i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3]; + temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2]; +#else + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3]; + temp_i += a_ptr[2] * 
x_ptr[3] - a_ptr[3] * x_ptr[2]; +#endif + + a_ptr += 4; + x_ptr += 4; + } + + for (; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; +#else + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; +#endif + + a_ptr += 2; + x_ptr += 2; + } + + } else { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; +#else + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; +#endif + + a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + return (0); + } + + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i1 = 0.0; + + if (lda == 4 && inc_x == 2) { + + for (i = 0; i < (n & -2); i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + + temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3]; + temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2]; + temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3]; + temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + + temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3]; + temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2]; + temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2]; +#endif + + a_ptr += 8; + x_ptr += 4; + } + + for (; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; +#endif + + a_ptr += 4; + x_ptr += 2; + } + + } else { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; +#endif + + 
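/* Generic-stride path: lda and inc_x were scaled by 2 for interleaved
   complex storage at the top of CNAME, so the steps below advance one
   column of A and one complex element of x per iteration. */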
a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + return (0); + } + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i1 = 0.0; + FLOAT temp_r2 = 0.0; + FLOAT temp_i2 = 0.0; + + if (lda == 6 && inc_x == 2) { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; +#endif + + a_ptr += 6; + x_ptr += 2; + } + + } else { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; +#endif + + a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2; + y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2; +#else + y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2; + y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2; +#endif + return (0); + } + + return (0); +} + diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S index d82fab16a..314cf5e6e 100644 --- a/kernel/power/zgemv_t.S +++ b/kernel/power/zgemv_t.S @@ -129,6 +129,11 @@ #define PREFETCHSIZE_C 16 #endif +#ifdef POWER3 +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + #ifdef POWER4 #define 
PREFETCHSIZE_A 34 #define PREFETCHSIZE_C 16 diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index 956d75ffc..e42eafaba 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -43,6 +43,134 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #elif HAVE_KERNEL_4x4_VEC +#if defined(POWER10) +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); + + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector_quad acc0, acc1, acc2, acc3;; + __vector_quad acc4, acc5, acc6, acc7; + v4sf_t result[4]; + __vector_pair *Va0, *Va1, *Va2, *Va3; + i = 0; + n = n << 1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + __builtin_mma_xxsetaccz (&acc2); + __builtin_mma_xxsetaccz (&acc3); + __builtin_mma_xxsetaccz (&acc4); + __builtin_mma_xxsetaccz (&acc5); + __builtin_mma_xxsetaccz (&acc6); + __builtin_mma_xxsetaccz (&acc7); + while (i < n) { + + vec_t *rx = (vec_t *) & x[i]; + Va0 = ((__vector_pair*)((void*)&a0[i])); + Va1 = ((__vector_pair*)((void*)&a1[i])); + Va2 = ((__vector_pair*)((void*)&a2[i])); + Va3 = ((__vector_pair*)((void*)&a3[i])); + + __builtin_mma_xvf64gerpp (&acc0, Va0[0], rx[0]); + __builtin_mma_xvf64gerpp (&acc1, Va1[0], rx[0]); + __builtin_mma_xvf64gerpp (&acc2, Va2[0], rx[0]); + __builtin_mma_xvf64gerpp (&acc3, Va3[0], rx[0]); + __builtin_mma_xvf64gerpp (&acc4, Va0[0], rx[1]); + __builtin_mma_xvf64gerpp (&acc5, Va1[0], rx[1]); + __builtin_mma_xvf64gerpp (&acc6, Va2[0], rx[1]); + __builtin_mma_xvf64gerpp (&acc7, Va3[0], rx[1]); + __builtin_mma_xvf64gerpp (&acc0, Va0[1], rx[2]); + __builtin_mma_xvf64gerpp (&acc1, Va1[1], rx[2]); + __builtin_mma_xvf64gerpp (&acc2, Va2[1], rx[2]); + __builtin_mma_xvf64gerpp (&acc3, Va3[1], rx[2]); + __builtin_mma_xvf64gerpp (&acc4, Va0[1], rx[3]); + __builtin_mma_xvf64gerpp (&acc5, Va1[1], rx[3]); + __builtin_mma_xvf64gerpp (&acc6, Va2[1], rx[3]); + __builtin_mma_xvf64gerpp (&acc7, Va3[1], rx[3]); + i += 8; + + } +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + __builtin_mma_disassemble_acc ((void *)result, &acc0); + register FLOAT temp_r0 = result[0][0] - result[1][1]; + register FLOAT temp_i0 = result[0][1] + result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc4); + temp_r0 += result[2][0] - result[3][1]; + temp_i0 += result[2][1] + result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc1); + register FLOAT temp_r1 = result[0][0] - result[1][1]; + register FLOAT temp_i1 = result[0][1] + result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc5); + temp_r1 += result[2][0] - result[3][1]; + temp_i1 += result[2][1] + result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc2); + register FLOAT temp_r2 = result[0][0] - result[1][1]; + register FLOAT temp_i2 = result[0][1] + result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc6); + temp_r2 += result[2][0] - result[3][1]; + temp_i2 += result[2][1] + result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc3); + register FLOAT temp_r3 = result[0][0] - result[1][1]; + register FLOAT temp_i3 = result[0][1] + result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc7); + temp_r3 += result[2][0] - result[3][1]; + temp_i3 += result[2][1] + result[3][0]; +#else + __builtin_mma_disassemble_acc ((void *)result, 
&acc0); + register FLOAT temp_r0 = result[0][0] + result[1][1]; + register FLOAT temp_i0 = result[0][1] - result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc4); + temp_r0 += result[2][0] + result[3][1]; + temp_i0 += result[2][1] - result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc1); + register FLOAT temp_r1 = result[0][0] + result[1][1]; + register FLOAT temp_i1 = result[0][1] - result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc5); + temp_r1 += result[2][0] + result[3][1]; + temp_i1 += result[2][1] - result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc2); + register FLOAT temp_r2 = result[0][0] + result[1][1]; + register FLOAT temp_i2 = result[0][1] - result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc6); + temp_r2 += result[2][0] + result[3][1]; + temp_i2 += result[2][1] - result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc3); + register FLOAT temp_r3 = result[0][0] + result[1][1]; + register FLOAT temp_i3 = result[0][1] - result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc7); + temp_r3 += result[2][0] + result[3][1]; + temp_i3 += result[2][1] - result[3][0]; +#endif +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; +#endif +} +#else static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { BLASLONG i; FLOAT *a0, *a1, *a2, *a3; @@ -198,6 +326,7 @@ static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA #endif } +#endif #else static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 5526f4d67..0068138e8 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -38,11 +38,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
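For reference, the POWER10 micro-kernel pulled in below performs plain complex scaling, x <- alpha*x, with the block size rounded down to a multiple of 8 (double) or 16 (single), as the updated n1 computation shows. A minimal scalar sketch of the same arithmetic follows; the function name and the unit-stride, interleaved (re, im) layout are illustrative and not part of the patch.

#include <stddef.h>

/* x holds n complex values as interleaved (re, im) pairs, unit stride. */
static void zscal_reference(size_t n, double *x, double alpha_r, double alpha_i)
{
    for (size_t i = 0; i < 2 * n; i += 2) {
        double xr = x[i], xi = x[i + 1];
        x[i]     = alpha_r * xr - alpha_i * xi;   /* Re(alpha * x) */
        x[i + 1] = alpha_r * xi + alpha_i * xr;   /* Im(alpha * x) */
    }
}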
#pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif +#elif defined(POWER10) +#if defined(DOUBLE) +#include "zscal_microk_power10.c" +#else +#include "cscal_microk_power10.c" +#endif #endif #endif @@ -145,7 +151,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F { +#if defined(DOUBLE) n1 = n & -8; +#else + n1 = n & -16; +#endif if ( n1 > 0 ) { zscal_kernel_8(n1, x, da_r, da_i); diff --git a/kernel/power/zscal_microk_power10.c b/kernel/power/zscal_microk_power10.c new file mode 100644 index 000000000..af99b8648 --- /dev/null +++ b/kernel/power/zscal_microk_power10.c @@ -0,0 +1,222 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) +{ + __vector double t0; + __vector double t1; + __vector double t2; + __vector double t3; + __vector double t4; + __vector double t5; + + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xsnegdp 33, %x10 \n\t" // -alpha_i + XXSPLTD_S(32,%x9,0) // alpha_r , alpha_r +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXMRGHD_S(33,33, %x10) // -alpha_i , alpha_i +#else + XXMRGHD_S(33,%x10, 33) // -alpha_i , alpha_i +#endif + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "addic. 
%1, %1, -8 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 34, 44, 32 \n\t" + "xvmuldp 35, 45, 32 \n\t" + "xvmuldp 36, 46, 32 \n\t" + "xvmuldp 37, 47, 32 \n\t" + + XXSWAPD_S(38,40) + XXSWAPD_S(39,41) + XXSWAPD_S(%x3,42) + XXSWAPD_S(%x4,43) + XXSWAPD_S(%x5,44) + XXSWAPD_S(%x6,45) + XXSWAPD_S(%x7,46) + XXSWAPD_S(%x8,47) + + "xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmuldp 39, 39, 33 \n\t" + + + "xvmuldp %x3, %x3, 33 \n\t" + "xvmuldp %x4, %x4, 33 \n\t" + + + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "xvmuldp %x5, %x5, 33 \n\t" + "xvmuldp %x6, %x6, 33 \n\t" + + + "xvmuldp %x7, %x7, 33 \n\t" + "xvmuldp %x8, %x8, 33 \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + + "xvadddp 48, 48, 38 \n\t" + "xvadddp 49, 49, 39 \n\t" + "xvadddp 50, 50, %x3 \n\t" + "xvadddp 51, 51, %x4 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%2) \n\t" + "stxv 49, 16(%2) \n\t" + "stxv 50, 32(%2) \n\t" + "stxv 51, 48(%2) \n\t" +#else + "stxv 49, 0(%2) \n\t" + "stxv 48, 16(%2) \n\t" + "stxv 51, 32(%2) \n\t" + "stxv 50, 48(%2) \n\t" +#endif + + + "xvadddp 34, 34, %x5 \n\t" + "xvadddp 35, 35, %x6 \n\t" + + + "xvadddp 36, 36, %x7 \n\t" + "xvadddp 37, 37, %x8 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 34, 64(%2) \n\t" + "stxv 35, 80(%2) \n\t" + "stxv 36, 96(%2) \n\t" + "stxv 37, 112(%2) \n\t" +#else + "stxv 35, 64(%2) \n\t" + "stxv 34, 80(%2) \n\t" + "stxv 37, 96(%2) \n\t" + "stxv 36, 112(%2) \n\t" +#endif + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -8 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 34, 44, 32 \n\t" + "xvmuldp 35, 45, 32 \n\t" + "xvmuldp 36, 46, 32 \n\t" + "xvmuldp 37, 47, 32 \n\t" + + XXSWAPD_S(38,40) + XXSWAPD_S(39,41) + XXSWAPD_S(%x3,42) + XXSWAPD_S(%x4,43) + XXSWAPD_S(%x5,44) + XXSWAPD_S(%x6,45) + XXSWAPD_S(%x7,46) + XXSWAPD_S(%x8,47) + + + "xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmuldp 39, 39, 33 \n\t" + "xvmuldp %x3, %x3, 33 \n\t" + "xvmuldp %x4, %x4, 33 \n\t" + "xvmuldp %x5, %x5, 33 \n\t" + "xvmuldp %x6, %x6, 33 \n\t" + "xvmuldp %x7, %x7, 33 \n\t" + "xvmuldp %x8, %x8, 33 \n\t" + + "xvadddp 48, 48, 38 \n\t" + "xvadddp 49, 49, 39 \n\t" + + "xvadddp 50, 50, %x3 \n\t" + "xvadddp 51, 51, %x4 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%2) \n\t" + "stxv 49, 16(%2) \n\t" + "stxv 50, 32(%2) \n\t" + "stxv 51, 48(%2) \n\t" +#else + "stxv 49, 0(%2) \n\t" + "stxv 48, 16(%2) \n\t" + "stxv 51, 32(%2) \n\t" + "stxv 50, 48(%2) \n\t" +#endif + "xvadddp 34, 34, %x5 \n\t" + "xvadddp 35, 35, %x6 \n\t" + + + "xvadddp 36, 36, %x7 \n\t" + "xvadddp 37, 37, %x8 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 34, 64(%2) \n\t" + "stxv 35, 80(%2) \n\t" + "stxv 36, 96(%2) \n\t" + "stxv 37, 112(%2) \n\t" +#else + "stxv 35, 64(%2) \n\t" + "stxv 34, 80(%2) \n\t" + "stxv 37, 96(%2) \n\t" + "stxv 36, 112(%2) \n\t" +#endif + "#n=%1 x=%0=%2 alpha=(%9,%10) \n" + : + "+m" (*x), + "+r" (n), // 1 + "+b" (x), // 2 + "=wa" (t0), // 3 + "=wa" (t1), // 4 + "=wa" (t2), // 5 + "=wa" (t3), // 6 + "=wa" (t4), // 7 + "=wa" (t5) // 8 + : + "d" (alpha_r), // 9 + "d" (alpha_i) // 10 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + 
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 3a5a8eb83..fe7871852 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "zswap_microk_power8.c" +#elif defined(POWER10) +#include "cswap_microk_power10.c" #endif #endif diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC index ea6a8cf21..61a8a2b91 100644 --- a/kernel/riscv64/KERNEL.RISCV64_GENERIC +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -54,6 +54,7 @@ SDOTKERNEL = ../riscv64/dot.c DDOTKERNEL = ../riscv64/dot.c CDOTKERNEL = ../riscv64/zdot.c ZDOTKERNEL = ../riscv64/zdot.c +DSDOTKERNEL = ../generic/dot.c SNRM2KERNEL = ../riscv64/nrm2.c DNRM2KERNEL = ../riscv64/nrm2.c diff --git a/kernel/riscv64/Makefile b/kernel/riscv64/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/riscv64/Makefile @@ -0,0 +1 @@ +clean :: diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index b6aec131e..5312f9ef0 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMAXVV_FLOAT vfmax_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMAXVV_FLOAT vfmax_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -62,19 +66,25 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; + 
FLOAT_V_T_M1 v_res, v_zero; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_zero = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; FLOAT zero = 0.0; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ BLASLONG inc_xv = inc_x * gvl; @@ -162,6 +175,7 @@ asm volatile( //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); #if defined(DOUBLE) asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e64,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -170,6 +184,7 @@ asm volatile( :"v0"); #else asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e32,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -185,6 +200,7 @@ asm volatile( //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); #if defined(DOUBLE) asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e64,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -193,6 +209,7 @@ asm volatile( :"v0"); #else asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e32,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -205,17 +222,17 @@ asm volatile( j += gvl*2; ix += inc_xv*2; } - v0 = VFMVVF_FLOAT(0, gvl); - v0 = VFREDMAXVS_FLOAT(v_max, v0, gvl); - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); + maxf = v_res[0]; } for(;j maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } } diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index 53243ad56..ae2867ef8 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -30,29 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
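The RISC-V changes in this patch all follow one shape: the old vsetvli/RVV_E* helpers become vsetvl_e*m8 macros, masked operations take the mask as their first argument, and reductions deposit their scalar into an LMUL=1 register. The sketch below condenses that flow for the contiguous case, written purely in terms of the macros this file defines (FLOAT, BLASLONG and FLT_MAX come from the usual OpenBLAS headers); it deliberately omits the two-accumulator unrolling and the strided/tail paths of the real kernel and is not part of the patch.

FLOAT amin_contiguous_sketch(BLASLONG n, FLOAT *x)
{
    BLASLONG i, j;
    unsigned int gvl = VSETVL_MAX;                    /* LMUL=1 length for the scalar accumulators */
    FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, gvl);
    FLOAT_V_T_M1 v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);

    gvl = VSETVL(n);
    FLOAT_V_T v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
    for (i = 0, j = 0; i < n / gvl; i++, j += gvl) {
        FLOAT_V_T vx = VLEV_FLOAT(&x[j], gvl);
        MASK_T mask  = VMFLTVF_FLOAT(vx, 0, gvl);         /* lanes where x < 0        */
        vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);   /* |x| via 0 - x, mask first */
        v_min = VFMINVV_FLOAT(v_min, vx, gvl);
    }
    /* reduction result lands in the m1-width v_res; element 0 is the scalar */
    v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
    return v_res[0];
}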
#include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -62,11 +66,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); MASK_T mask0, mask1; - FLOAT zero = 0.0; + FLOAT zero = 0.0; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT 
vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -61,39 +65,43 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); vy = VLEV_FLOAT(&y[j], gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else if(inc_y == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); unsigned int stride_x = inc_x * sizeof(FLOAT); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); vy = VLEV_FLOAT(&y[j], gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); unsigned int stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int stride_y = inc_y * sizeof(FLOAT); @@ -150,20 +156,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) j += gvl; } if(j > 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + 
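/* The sum reduction now targets the m1-width v_res; element 0 carries the
   scalar partial sum that is folded into dot. */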
dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } } return(dot); diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index bd4d23eae..32ca8618b 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -27,23 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLEV_FLOAT vlev_float32xm4 -#define VLSEV_FLOAT vlsev_float32xm4 -#define VSEV_FLOAT vsev_float32xm4 -#define VSSEV_FLOAT vssev_float32xm4 -#define VFMACCVF_FLOAT vfmaccvf_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle_v_f32m4 +#define VLSEV_FLOAT vlse_v_f32m4 +#define VSEV_FLOAT vse_v_f32m4 +#define VSSEV_FLOAT vsse_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLEV_FLOAT vlev_float64xm4 -#define VLSEV_FLOAT vlsev_float64xm4 -#define VSEV_FLOAT vsev_float64xm4 -#define VSSEV_FLOAT vssev_float64xm4 -#define VFMACCVF_FLOAT vfmaccvf_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle_v_f64m4 +#define VLSEV_FLOAT vlse_v_f64m4 +#define VSEV_FLOAT vse_v_f64m4 +#define VSSEV_FLOAT vsse_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -57,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT_V_T va0, va1, vy0, vy1; unsigned int gvl = 0; if(inc_y == 1){ - gvl = vsetvli(m, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m); if(gvl <= m/2){ for(k=0,j=0; k maxf){ //tail index v_max_index = VIDV_UINT(gvl); @@ -135,7 +142,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -145,35 +152,33 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = 
VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index 608f19a00..5bcffece5 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -32,49 +32,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 
vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -89,42 +93,45 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); @@ -136,7 +143,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -146,35 +153,33 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = 
VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c index 44af7101b..42705f5de 100644 --- a/kernel/riscv64/imax_vector.c +++ b/kernel/riscv64/imax_vector.c @@ -32,45 +32,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 -#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 -#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -85,8 +89,13 @@ BLASLONG CNAME(BLASLONG n, 
FLOAT *x, BLASLONG inc_x) UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_min; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_max_index = VMVVX_UINT(0, gvl); v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ @@ -94,27 +103,25 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max = VLEV_FLOAT(&x[j], gvl); - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); @@ -126,7 +133,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -137,28 +144,26 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/imin.c b/kernel/riscv64/imin.c index 598cba387..ffc65226e 100644 --- a/kernel/riscv64/imin.c +++ b/kernel/riscv64/imin.c @@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > minf ) + if( x[ix] < minf ) { min = i; minf = x[ix]; diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c index e6e0e9f9f..3afa74dd6 100644 --- a/kernel/riscv64/imin_vector.c +++ b/kernel/riscv64/imin_vector.c @@ -32,45 +32,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
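The one-character imin.c change above is a correctness fix: with the old '>' comparison the running value was updated on larger elements, so the loop behaved like an argmax. A minimal illustration of the intended argmin loop is shown below; the function name is illustrative, the result here is the 0-based position, and FLOAT/BLASLONG are the types from OpenBLAS common.h.

/* Find the position of the smallest element of x with stride inc_x. */
BLASLONG imin_sketch(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i = 1, ix = inc_x, min = 0;
    FLOAT minf;
    if (n <= 0 || inc_x <= 0) return 0;
    minf = x[0];
    while (i < n) {
        if (x[ix] < minf) {   /* '<' tracks the minimum; the old '>' tracked a maximum */
            min = i;
            minf = x[ix];
        }
        ix += inc_x;
        i++;
    }
    return min;   /* 0-based here; the real kernel follows the BLAS return convention */
}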
#if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -85,15 +89,20 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -113,26 
+122,24 @@ asm volatile( :"v0"); #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min = VLEV_FLOAT(&x[j], gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); @@ -143,7 +150,7 @@ asm volatile( } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -154,7 +161,7 @@ asm volatile( //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -175,27 +182,25 @@ asm volatile( #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c index 62c95d973..ddb5eabde 100644 --- a/kernel/riscv64/izamax_vector.c +++ b/kernel/riscv64/izamax_vector.c @@ -30,47 +30,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(DOUBLE) -#define RVV_EFLOAT RVV_E64 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 -#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 -#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif #define RVV_M RVV_M8 @@ -86,7 +92,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_max_index; MASK_T mask0, mask1; unsigned int gvl = 0; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + gvl = VSETVL(n); v_max_index = 
VMVVX_UINT(0, gvl); v_max = VFMVVF_FLOAT(-1, gvl); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -96,7 +107,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -119,7 +130,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -143,7 +154,7 @@ asm volatile( //index where element greater than v_max mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask0, gvl); + v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -163,7 +174,7 @@ asm volatile( :"v0"); #endif */ - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask0, gvl); + v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); @@ -171,19 +182,19 @@ asm volatile( ix += inc_xv; } vx0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); - maxf = vx0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + maxf = v_res[0]; mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask0,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max_index = VMVVX_UINT(0, gvl); vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -206,7 +217,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -227,9 +238,8 @@ asm volatile( #endif */ v_max = VFADDVV_FLOAT(vx0, vx1, gvl); - vx0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); - FLOAT cur_maxf = vx0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index 38eccf1b5..6e328dc31 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -31,50 +31,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
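One pattern repeats in every kernel touched here: the old reduction idiom splatted the identity value into a full-width working register, reduced into it, and read lane 0; the rewritten kernels instead keep two LMUL=1 registers for the whole call, one for the reduction result and one holding the identity (zero, FLT_MAX, or -FLT_MAX depending on the kernel), and pass both to the vfred* intrinsic. A sketch of the max-reduction case using the macro names these files define (an illustrative fragment, not a standalone translation unit; the concrete intrinsics depend on the type and LMUL chosen in each file's #define block):

    /* old: reuse an m8 register as both identity and destination   */
    /*   vx   = VFMVVF_FLOAT(-FLT_MAX, gvl);                        */
    /*   vx   = VFREDMAXVS_FLOAT(v_max, vx, gvl);                   */
    /*   maxf = vx[0];                                              */

    /* new: dedicated m1 registers, set up once per call            */
    FLOAT_V_T_M1 v_res, v_min;
    gvl   = VSETVL_MAX;                      /* vl for the m1 helpers        */
    v_res = VFMVVF_FLOAT_M1(0, gvl);         /* reduction destination        */
    v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl);  /* identity for a max reduction */

    /* loop epilogue: destination, source vector, identity, vl               */
    v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
    maxf  = v_res[0];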
#if defined(DOUBLE) -#define RVV_EFLOAT RVV_E64 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif -#define RVV_M RVV_M8 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -87,7 +92,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask0, mask1; unsigned int gvl = 0; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = 
VFMVVF_FLOAT_M1(FLT_MAX, gvl); + + gvl = VSETVL(n); v_min_index = VMVVX_UINT(0, gvl); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -97,7 +107,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -120,7 +130,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -144,7 +154,7 @@ asm volatile( //index where element less than v_min mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask0, gvl); + v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -164,27 +174,26 @@ asm volatile( :"v0"); #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask0, gvl); + v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, gvl); j += gvl; ix += inc_xv; } - vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); - vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); - minf = vx0[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask0,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min_index = VMVVX_UINT(0, gvl); vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -207,7 +216,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -228,9 +237,8 @@ asm volatile( #endif */ v_min = VFADDVV_FLOAT(vx0, vx1, gvl); - vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); - vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); - FLOAT cur_minf = vx0[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c index 4ef75452d..0fc59b74c 100644 --- a/kernel/riscv64/max_vector.c +++ b/kernel/riscv64/max_vector.c @@ -29,23 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
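Two smaller API details show up in the complex iamax/iamin hunks above. First, the masked intrinsic forms now take the mask as their first operand (VIDV_MASK_UINT(mask, vd, vl), VADDVX_MASK_UINT(mask, vd, vs, x, vl), and so on), where the old wrappers passed it after the data operands. Second, these kernels compute fabs() of a vector by building a mask of negative lanes and negating only those lanes with a masked reverse-subtract from zero. In terms of the macros defined above (fragment for illustration only):

    /* |vx0| without a dedicated abs intrinsic: flip just the negative lanes */
    mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);                   /* lanes with vx0 < 0      */
    vx0   = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* vx0 = 0 - vx0 on mask0  */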
#include #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -55,9 +59,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=-FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; + FLOAT_V_T_M1 v_res, v_min; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); @@ -96,17 +102,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl * 2; idx += inc_xv * 2; } - v1 = VFMVVF_FLOAT(-FLT_MAX, gvl); - v0 = VFREDMAXVS_FLOAT(v_max, v1, gvl); - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; } for(;j maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } } diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c index 83c965bfa..8223fa87a 100644 --- a/kernel/riscv64/min_vector.c +++ b/kernel/riscv64/min_vector.c @@ -29,23 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -55,9 +59,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VSEV_FLOAT vsev_float32xm8 -#define VSSEV_FLOAT vssev_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VSEV_FLOAT vse_v_f32m8 +#define VSSEV_FLOAT vsse_v_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VSEV_FLOAT vsev_float64xm8 -#define VSSEV_FLOAT vssev_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VSEV_FLOAT vse_v_f64m8 +#define VSSEV_FLOAT vsse_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if (n < 0) return(0); if(inc_x == 1 && inc_y == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ for(i=0,j=0; i 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < len / gvl; k++){ va = VLEV_FLOAT(&a_ptr[i], gvl); @@ -89,11 +97,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = 
VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -101,9 +108,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += alpha * temp2; @@ -121,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i = j + 1; len = m - i; if(len > 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < len / gvl; k++){ @@ -136,11 +142,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -148,9 +153,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += alpha * temp2; @@ -169,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i = j + 1; len = m - i; if(len > 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); vr = VFMVVF_FLOAT(0, gvl); inc_xv = inc_x * gvl; for(k = 0; k < len / gvl; k++){ @@ -184,11 +188,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; ix += inc_xv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -196,9 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += alpha * temp2; @@ -220,7 +222,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i = j + 1; len = m - i; if(len > 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); inc_xv = inc_x * gvl; inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); @@ -237,11 +239,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -249,9 +250,8 @@ int CNAME(BLASLONG m, BLASLONG 
offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += alpha * temp2; diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c index 29e0e4b65..7229a48b1 100644 --- a/kernel/riscv64/symv_U_vector.c +++ b/kernel/riscv64/symv_U_vector.c @@ -27,33 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLEV_FLOAT vlev_float32xm4 -#define VLSEV_FLOAT vlsev_float32xm4 -#define VSEV_FLOAT vsev_float32xm4 -#define VSSEV_FLOAT vssev_float32xm4 -#define VFREDSUM_FLOAT vfredsumvs_float32xm4 -#define VFMACCVV_FLOAT vfmaccvv_float32xm4 -#define VFMACCVF_FLOAT vfmaccvf_float32xm4 -#define VFMVVF_FLOAT vfmvvf_float32xm4 -#define VFDOTVV_FLOAT vfdotvv_float32xm4 -#define VFMULVV_FLOAT vfmulvv_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m4 +#define VLSEV_FLOAT vlse_v_f32m4 +#define VSEV_FLOAT vse_v_f32m4 +#define VSSEV_FLOAT vsse_v_f32m4 +#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFDOTVV_FLOAT vfdot_vv_f32m4 +#define VFMULVV_FLOAT vfmul_vv_f32m4 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLEV_FLOAT vlev_float64xm4 -#define VLSEV_FLOAT vlsev_float64xm4 -#define VSEV_FLOAT vsev_float64xm4 -#define VSSEV_FLOAT vssev_float64xm4 -#define VFREDSUM_FLOAT vfredsumvs_float64xm4 -#define VFMACCVV_FLOAT vfmaccvv_float64xm4 -#define VFMACCVF_FLOAT vfmaccvf_float64xm4 -#define VFMVVF_FLOAT vfmvvf_float64xm4 -#define VFDOTVV_FLOAT vfdotvv_float64xm4 -#define VFMULVV_FLOAT vfmulvv_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m4 +#define VLSEV_FLOAT vlse_v_f64m4 +#define VSEV_FLOAT vse_v_f64m4 +#define VSSEV_FLOAT vsse_v_f64m4 +#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFDOTVV_FLOAT vfdot_vv_f64m4 +#define VFMULVV_FLOAT vfmul_vv_f64m4 #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -65,6 +69,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA FLOAT temp2; FLOAT *a_ptr = a; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); FLOAT_V_T va, vx, vy, vr; BLASLONG stride_x, stride_y, inc_xv, inc_yv; @@ -78,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA temp2 = 0.0; if(j > 0){ i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < j / gvl; k++){ vy = VLEV_FLOAT(&y[i], gvl); @@ -91,11 +99,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT 
*a, BLASLONG lda, FLOA i += gvl; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -103,9 +110,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += temp1 * a_ptr[j] + alpha * temp2; @@ -122,7 +128,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA if(j > 0){ iy = 0; i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < j / gvl; k++){ @@ -137,11 +143,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -149,9 +154,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += temp1 * a_ptr[j] + alpha * temp2; @@ -169,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA if(j > 0){ ix = 0; i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_xv = inc_x * gvl; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < j / gvl; k++){ @@ -184,11 +188,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; ix += inc_xv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -196,9 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += temp1 * a_ptr[j] + alpha * temp2; @@ -219,7 +221,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix = 0; iy = 0; i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_xv = inc_x * gvl; inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); @@ -236,11 +238,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLSEV_FLOAT(&y[iy], stride_y, 
gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -248,9 +249,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += temp1 * a_ptr[j] + alpha * temp2; diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index a6c742b14..5cd65b225 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -62,19 +66,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_max = VFMVVF_FLOAT(0, gvl); BLASLONG inc_xv = inc_x * gvl * 2; for(; i maxf) - maxf = v_max[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v1, v_z0, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; } return(maxf); } diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index 44a7cf1dc..9d567b3da 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -30,29 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
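For the complex amax/amin kernels (zamax_vector.c above and zamin_vector.c below), the quantity being maximised or minimised is |re| + |im| per element, the usual BLAS convention, not the complex modulus: the vector code takes fabs of the two strided loads and adds them. A scalar reference of what the zamax kernel computes, assuming inc_x > 0 and interleaved re/im storage:

    #include <math.h>
    #include <stddef.h>

    /* max over i of |Re(x_i)| + |Im(x_i)| for x stored as re,im,re,im,... */
    static double zamax_ref(size_t n, const double *x, size_t inc_x)
    {
        double maxf = 0.0;
        for (size_t i = 0; i < n; i++) {
            double v = fabs(x[2 * i * inc_x]) + fabs(x[2 * i * inc_x + 1]);
            if (v > maxf)
                maxf = v;
        }
        return maxf;
    }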
#include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -63,18 +67,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); BLASLONG inc_xv = inc_x * gvl * 2; for(; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 -#define MASK_T e64xm8_t -#define 
VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -61,40 +65,44 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; if(inc_x == 1){ BLASLONG n2 = n * 2; - gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n2); v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n2/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); inc_xv = incx * gvl * 2; inc_yv = incy * gvl * 2; inc_av = gvl * 2; @@ -134,13 +141,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 = vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 = vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 = v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -173,11 +179,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); #endif - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 += vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 += vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 += v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 += v_res[0]; } } y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c index 6fe12c76c..40cd9cd64 100644 --- a/kernel/riscv64/zhemv_UV_vector.c +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -27,31 +27,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
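The complex asum kernel in the hunks above (the one returning asumf) follows the same convention: it sums |re| + |im| over all elements, which is how scasum/dzasum are defined. A scalar reference, again assuming inc_x > 0 and interleaved storage:

    #include <math.h>
    #include <stddef.h>

    /* sum over i of |Re(x_i)| + |Im(x_i)| */
    static double zasum_ref(size_t n, const double *x, size_t inc_x)
    {
        double s = 0.0;
        for (size_t i = 0; i < n; i++)
            s += fabs(x[2 * i * inc_x]) + fabs(x[2 * i * inc_x + 1]);
        return s;
    }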
#include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLSEV_FLOAT vlsev_float32xm4 -#define VSSEV_FLOAT vssev_float32xm4 -#define VFREDSUM_FLOAT vfredsumvs_float32xm4 -#define VFMACCVV_FLOAT vfmaccvv_float32xm4 -#define VFMACCVF_FLOAT vfmaccvf_float32xm4 -#define VFMVVF_FLOAT vfmvvf_float32xm4 -#define VFMULVV_FLOAT vfmulvv_float32xm4 -#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 -#define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m4 +#define VSSEV_FLOAT vsse_v_f32m4 +#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLSEV_FLOAT vlsev_float64xm4 -#define VSSEV_FLOAT vssev_float64xm4 -#define VFREDSUM_FLOAT vfredsumvs_float64xm4 -#define VFMACCVV_FLOAT vfmaccvv_float64xm4 -#define VFMACCVF_FLOAT vfmaccvf_float64xm4 -#define VFMVVF_FLOAT vfmvvf_float64xm4 -#define VFMULVV_FLOAT vfmulvv_float64xm4 -#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 -#define VFNMSACVV_FLOAT vfnmsacvv_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m4 +#define VSSEV_FLOAT vsse_v_f64m4 +#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ @@ -62,7 +66,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B FLOAT temp_r2, temp_i2; FLOAT *a_ptr = a; unsigned int gvl = 0; - + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; @@ -89,7 +96,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B ia = 0; i = 0; if(j > 0){ - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_xv = incx * gvl * 2; inc_yv = incy * gvl * 2; inc_av = gvl * 2; @@ -133,13 +140,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 = vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 = vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 = v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -172,11 +178,10 @@ int CNAME(BLASLONG m, BLASLONG 
offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); #endif - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 += vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 += vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 += v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 += v_res[0]; } } y[jy] += temp_r1 * a_ptr[ja]; diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c index b0ebfa5f4..5ac62eb80 100644 --- a/kernel/riscv64/znrm2_vector.c +++ b/kernel/riscv64/znrm2_vector.c @@ -27,41 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLEV_FLOAT vlev_float32xm4 -#define VLSEV_FLOAT vlsev_float32xm4 -#define VFREDSUM_FLOAT vfredsumvs_float32xm4 -#define VFMACCVV_FLOAT vfmaccvv_float32xm4 -#define VFMVVF_FLOAT vfmvvf_float32xm4 -#define VFDOTVV_FLOAT vfdotvv_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m4 +#define VLSEV_FLOAT vlse_v_f32m4 +#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFDOTVV_FLOAT vfdot_vv_f32m4 #define ABS fabsf -#define MASK_T e32xm4_t -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 -#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 -#define VMFIRSTM vmfirstm_e32xm4 -#define VFDIVVF_FLOAT vfdivvf_float32xm4 -#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 +#define MASK_T vbool8_t +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m +#define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8 +#define VMFIRSTM vmfirst_m_b8 +#define VFDIVVF_FLOAT vfdiv_vf_f32m4 +#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLEV_FLOAT vlev_float64xm4 -#define VLSEV_FLOAT vlsev_float64xm4 -#define VFREDSUM_FLOAT vfredsumvs_float64xm4 -#define VFMACCVV_FLOAT vfmaccvv_float64xm4 -#define VFMVVF_FLOAT vfmvvf_float64xm4 -#define VFDOTVV_FLOAT vfdotvv_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m4 +#define VLSEV_FLOAT vlse_v_f64m4 +#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFDOTVV_FLOAT vfdot_vv_f64m4 #define ABS fabs -#define MASK_T e64xm4_t -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 -#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 -#define VMFIRSTM vmfirstm_e64xm4 -#define VFDIVVF_FLOAT vfdivvf_float64xm4 -#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 +#define MASK_T vbool16_t +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m +#define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16 +#define VMFIRSTM vmfirst_m_b16 +#define VFDIVVF_FLOAT vfdiv_vf_f64m4 +#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -73,19 +77,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT_V_T vr, v0, 
v_zero; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT scale = 0.0, ssq = 0.0; MASK_T mask; BLASLONG index = 0; if(inc_x == 1){ BLASLONG n2 = n * 2; - gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n2); vr = VFMVVF_FLOAT(0, gvl); v_zero = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VSEV_FLOAT vsev_float32xm8 -#define VSSEV_FLOAT vssev_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VSEV_FLOAT vse_v_f32m8 +#define VSSEV_FLOAT vsse_v_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VSEV_FLOAT vsev_float64xm8 -#define VSSEV_FLOAT vssev_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VSEV_FLOAT vse_v_f64m8 +#define VSSEV_FLOAT vsse_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm if (n < 0) return(0); if(inc_x == 1 && inc_y == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); BLASLONG n2 = n * 2; if(gvl <= n2/2){ for(i=0,j=0; i +#define _MM512_BROADCASTD_EPI32(addr, zmm) \ + __asm__ ("vpbroadcastd (%1), %0;" \ + : "=v" (zmm) \ + : "r" (addr) ) + +#define PREFETCH_T0(addr) \ + __asm__ ("prefetcht0 (%0);" \ + : \ + : "r" (addr) ) + #define EXTRACT_LOW_256_FROM_512_2X(reg256, reg512) \ reg256##_0 = _mm512_castps512_ps256(reg512##_0); \ reg256##_1 = _mm512_castps512_ps256(reg512##_1); @@ -46,25 +56,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
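The znrm2_vector.c changes above keep the kernel's scale/ssq formulation. For context, this is the standard overflow-safe way to accumulate a 2-norm: carry the largest magnitude seen so far as scale and the sum of squares of the rescaled values as ssq, then return scale*sqrt(ssq). A scalar sketch of the recurrence for the real case (the vector kernel applies the same idea blockwise and its bookkeeping differs in detail, e.g. its initial ssq):

    #include <math.h>
    #include <stddef.h>

    /* overflow/underflow-safe 2-norm via the scale/ssq recurrence */
    static double nrm2_ref(size_t n, const double *x)
    {
        double scale = 0.0, ssq = 1.0;
        for (size_t i = 0; i < n; i++) {
            double a = fabs(x[i]);
            if (a == 0.0)
                continue;
            if (a > scale) {
                ssq   = 1.0 + ssq * (scale / a) * (scale / a);
                scale = a;
            } else {
                ssq  += (a / scale) * (a / scale);
            }
        }
        return scale * sqrt(ssq);
    }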
#define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n) \ - regArray##_0 = _mm256_loadu_si256(&a[(idx_m+0)*lda + idx_n]); \ - regArray##_1 = _mm256_loadu_si256(&a[(idx_m+1)*lda + idx_n]); \ - regArray##_2 = _mm256_loadu_si256(&a[(idx_m+2)*lda + idx_n]); \ - regArray##_3 = _mm256_loadu_si256(&a[(idx_m+3)*lda + idx_n]); \ - regArray##_4 = _mm256_loadu_si256(&a[(idx_m+4)*lda + idx_n]); \ - regArray##_5 = _mm256_loadu_si256(&a[(idx_m+5)*lda + idx_n]); \ - regArray##_6 = _mm256_loadu_si256(&a[(idx_m+6)*lda + idx_n]); \ - regArray##_7 = _mm256_loadu_si256(&a[(idx_m+7)*lda + idx_n]); + regArray##_0 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+0)*lda + idx_n])); \ + regArray##_1 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+1)*lda + idx_n])); \ + regArray##_2 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+2)*lda + idx_n])); \ + regArray##_3 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+3)*lda + idx_n])); \ + regArray##_4 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+4)*lda + idx_n])); \ + regArray##_5 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+5)*lda + idx_n])); \ + regArray##_6 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+6)*lda + idx_n])); \ + regArray##_7 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+7)*lda + idx_n])); #define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n) \ - regArray##_0 = _mm_loadu_si128(&a[(idx_m+0)*lda + idx_n]); \ - regArray##_1 = _mm_loadu_si128(&a[(idx_m+1)*lda + idx_n]); \ - regArray##_2 = _mm_loadu_si128(&a[(idx_m+2)*lda + idx_n]); \ - regArray##_3 = _mm_loadu_si128(&a[(idx_m+3)*lda + idx_n]); \ - regArray##_4 = _mm_loadu_si128(&a[(idx_m+4)*lda + idx_n]); \ - regArray##_5 = _mm_loadu_si128(&a[(idx_m+5)*lda + idx_n]); \ - regArray##_6 = _mm_loadu_si128(&a[(idx_m+6)*lda + idx_n]); \ - regArray##_7 = _mm_loadu_si128(&a[(idx_m+7)*lda + idx_n]); + regArray##_0 = _mm_loadu_si128((__m128i *)(&a[(idx_m+0)*lda + idx_n])); \ + regArray##_1 = _mm_loadu_si128((__m128i *)(&a[(idx_m+1)*lda + idx_n])); \ + regArray##_2 = _mm_loadu_si128((__m128i *)(&a[(idx_m+2)*lda + idx_n])); \ + regArray##_3 = _mm_loadu_si128((__m128i *)(&a[(idx_m+3)*lda + idx_n])); \ + regArray##_4 = _mm_loadu_si128((__m128i *)(&a[(idx_m+4)*lda + idx_n])); \ + regArray##_5 = _mm_loadu_si128((__m128i *)(&a[(idx_m+5)*lda + idx_n])); \ + regArray##_6 = _mm_loadu_si128((__m128i *)(&a[(idx_m+6)*lda + idx_n])); \ + regArray##_7 = _mm_loadu_si128((__m128i *)(&a[(idx_m+7)*lda + idx_n])); #define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n) \ @@ -143,11 +153,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define BF16_VECTOR_LOAD_1x16(reg, x, idx_n) \ - reg = _mm256_loadu_si256(x + idx_n); + reg = _mm256_loadu_si256((__m256i *)(x + idx_n)); #define BF16_VECTOR_LOAD_1x8(reg, x, idx_n) \ - reg = _mm_loadu_si128(x + idx_n); + reg = _mm_loadu_si128((__m128i *)(x + idx_n)); #define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask) \ @@ -721,6 +731,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
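The pointer casts added to these load macros are needed because the bf16 matrices and vectors are addressed through 16-bit element pointers, while _mm_loadu_si128 and _mm256_loadu_si256 are declared to take __m128i const* and __m256i const*; without the cast, newer compilers warn about or, under stricter settings, reject the incompatible pointer type. A small self-contained illustration of the same fix (the uint16_t element type stands in for the kernel's bf16 storage):

    #include <immintrin.h>
    #include <stdint.h>

    /* load 16 consecutive 16-bit values; the (const __m256i *) cast is
       exactly what the patch adds throughout these macros */
    static inline __m256i load_16x16bit(const uint16_t *p)
    {
        return _mm256_loadu_si256((const __m256i *)p);
    }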
_mm_mask_storeu_ps(targetAddr, mask, regResult); +/* Store 16 (result + y) to y +*/ +#define STORE16_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr) \ + regResult = _mm512_add_ps(regResult, _mm512_loadu_ps(targetAddr)); \ + _mm512_storeu_ps(targetAddr, regResult); + + +/* Masked store 16 (result + y) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr, mask) \ + regResult = _mm512_add_ps(regResult, _mm512_maskz_loadu_ps(mask, targetAddr)); \ + _mm512_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 8 (result + y) to y +*/ +#define STORE8_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr) \ + regResult = _mm256_add_ps(regResult, _mm256_loadu_ps(targetAddr)); \ + _mm256_storeu_ps(targetAddr, regResult); + + +/* Masked store 8 (result + y) to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr, mask) \ + regResult = _mm256_add_ps(regResult, _mm256_maskz_loadu_ps(mask, targetAddr)); \ + _mm256_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 4 (result + y) to y +*/ +#define STORE4_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr) \ + regResult = _mm_add_ps(regResult, _mm_loadu_ps(targetAddr)); \ + _mm_storeu_ps(targetAddr, regResult); + + +/* Masked store 4 (result + y) to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr, mask) \ + regResult = _mm_add_ps(regResult, _mm_maskz_loadu_ps(mask, targetAddr)); \ + _mm_mask_storeu_ps(targetAddr, mask, regResult); + + /* Store 16 (alpha * result) to y */ #define STORE16_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \ diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c index a1bd76f33..60feec0ce 100644 --- a/kernel/x86_64/casum.c +++ b/kernel/x86_64/casum.c @@ -130,7 +130,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) mode = BLAS_DOUBLE | BLAS_COMPLEX; #endif blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, - NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c index d51929f9f..b398aa6e1 100644 --- a/kernel/x86_64/casum_microk_skylakex-2.c +++ b/kernel/x86_64/casum_microk_skylakex-2.c @@ -15,7 +15,7 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x) if (n2 < 64) { __m128 accum_10, accum_11, accum_12, accum_13; - __m128 abs_mask1; + __m128 abs_mask1 = abs_mask1; accum_10 = _mm_setzero_ps(); accum_11 = _mm_setzero_ps(); diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index c19b98f02..7270a98bc 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caxpy_microk_steamroller-2.c" #elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #include "caxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "caxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index f2bf19dcd..264776239 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -27,14 +27,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
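The STORE*_COMPLETE_RESULT_ONE_ONE macros added in the bf16_common_macros.h hunk above handle the case where the computed partial result is neither scaled nor allowed to overwrite y, i.e. the alpha == 1, beta == 1 path of the bf16 GEMV kernels, with masked variants for the tails. A self-contained sketch of what the 16-float variants amount to (the real macros are textual and operate directly on the caller's registers):

    #include <immintrin.h>

    /* STORE16_COMPLETE_RESULT_ONE_ONE: y[0..15] += result */
    static inline void store16_one_one(__m512 result, float *y)
    {
        result = _mm512_add_ps(result, _mm512_loadu_ps(y));
        _mm512_storeu_ps(y, result);
    }

    /* STORE16_MASK_COMPLETE_RESULT_ONE_ONE: masked-off lanes read as zero
       and are left untouched in memory */
    static inline void store16_mask_one_one(__m512 result, float *y, __mmask16 mask)
    {
        result = _mm512_add_ps(result, _mm512_maskz_loadu_ps(mask, y));
        _mm512_mask_storeu_ps(y, mask, result);
    }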
#include "common.h" -#include #if defined(BULLDOZER) #include "cdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "cdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "cdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "cdot_microk_sandy-2.c" diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index 0ed02b8d8..3ca173c20 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "cgemv_n_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_n_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index c2903b11f..3187e196c 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "cgemv_t_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_t_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 6d75358a6..dc3f688c6 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
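Several hunks above and below simply append SAPPHIRERAPIDS to the preprocessor chains that pick a micro-kernel, so the new target reuses the existing Haswell/Skylake implementations instead of dropping to the generic C path. A toy, standalone illustration of this compile-time dispatch pattern; the functions here are invented stand-ins, not OpenBLAS code:

#include <stdio.h>

static double axpy_haswell(double a, double x, double y) { return a * x + y; } /* stand-in for the AVX2 kernel */
static double axpy_generic(double a, double x, double y) { return a * x + y; } /* stand-in for the portable fallback */

int main(void)
{
    /* build with e.g. -DSAPPHIRERAPIDS to take the optimized branch */
#if defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
    printf("%g\n", axpy_haswell(2.0, 3.0, 1.0));
#else
    printf("%g\n", axpy_generic(2.0, 3.0, 1.0));
#endif
    return 0;
}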
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c index ddec21383..a9c40f38f 100644 --- a/kernel/x86_64/dasum.c +++ b/kernel/x86_64/dasum.c @@ -6,7 +6,7 @@ #if defined(SKYLAKEX) #include "dasum_microk_skylakex-2.c" -#elif defined(HASWELL) +#elif defined(HASWELL) || defined(ZEN) #include "dasum_microk_haswell-2.c" #endif @@ -93,7 +93,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #if defined(SMP) int nthreads; FLOAT dummy_alpha; - FLOAT * dummy_b; #endif FLOAT sumf = 0.0; @@ -115,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #else mode = BLAS_DOUBLE | BLAS_REAL; #endif - blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, dummy_b, 0, result, 0, (void *)asum_thread_function, nthreads); + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c index 4fc73ddd4..fd9da7ebe 100644 --- a/kernel/x86_64/dasum_microk_haswell-2.c +++ b/kernel/x86_64/dasum_microk_haswell-2.c @@ -38,10 +38,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); for (i = 0; i < tail_index_AVX2; i += 16) { - accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 4]), abs_mask); - accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); - accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+12]), abs_mask); + accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask); + accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 4]), abs_mask); + accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask); + accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+12]), abs_mask); } accum_0 = accum_0 + accum_1 + accum_2 + accum_3; @@ -63,10 +63,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { - accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); - accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); - accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2); } accum_20 = accum_20 + accum_21 + accum_22 + accum_23; diff --git a/kernel/x86_64/dasum_microk_skylakex-2.c b/kernel/x86_64/dasum_microk_skylakex-2.c index aea8c02d9..83bc078b3 100644 --- a/kernel/x86_64/dasum_microk_skylakex-2.c +++ b/kernel/x86_64/dasum_microk_skylakex-2.c @@ 
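The dasum micro-kernel hunks keep the same absolute-value trick, AND-ing each double with 0x7fffffffffffffff to clear the sign bit, and only add the (__m256i*)/(__m128i*) casts the integer load intrinsics require. A standalone sketch of that idiom, not part of the patch, using _mm256_castsi256_pd in place of the GNU-style vector cast the kernel itself uses (AVX2; data invented):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    double x[4] = { -1.5, 2.0, -3.25, 4.0 };

    __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff);
    __m256d ax = _mm256_castsi256_pd(
                     _mm256_and_si256(_mm256_loadu_si256((const __m256i *)x),
                                      abs_mask));

    double out[4];
    _mm256_storeu_pd(out, ax);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);   /* 1.5 2 3.25 4 */
    return 0;
}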
-58,10 +58,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { - accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); - accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); - accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2); } accum_20 = accum_20 + accum_21 + accum_22 + accum_23; diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 26437012c..2796b8270 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "daxpy_microk_piledriver-2.c" #elif defined(HASWELL) || defined(ZEN) #include "daxpy_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "daxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index e4b6622e6..f3b9ee701 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ddot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "ddot_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "ddot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "ddot_microk_sandy-2.c" @@ -190,7 +190,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #endif blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, result, 0, - ( void *)dot_thread_function, nthreads); + (int (*)(void)) dot_thread_function, nthreads); ptr = (RETURN_TYPE *)result; for (i = 0; i < nthreads; i++) { diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c index 9f2bf24e2..15185d7fc 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -149,6 +149,7 @@ #define KERNEL_h_k1m16n2 \ "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; vmovddup 64(%0),%%zmm3; vmovddup 72(%0),%%zmm4; addq $128,%0;"\ unit_acc_m16n2(8,9,10,11,%1) + #endif #define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" #define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "prefetcht0 384(%0);" unit_acc_m16n2(12,13,14,15,%1,%%r12,1) @@ -283,7 +284,32 @@ #define KERNEL_h_k1m4n10 KERNEL_h_k1m4n8 unit_acc_m4n2(12,13,%%r15,%%r12,1) #define KERNEL_k1m4n10 KERNEL_h_k1m4n10 "addq $16,%%r15;" #define KERNEL_h_k1m4n12 KERNEL_h_k1m4n10 unit_acc_m4n2(14,15,%%r15,%%r12,2) -#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;" +//#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;" +#define unit_acc_k2m4n2(c1_no,c2_no,...)\ + "vbroadcastf64x4 ("#__VA_ARGS__"),%%zmm3; vpermpd %%zmm3,%%zmm30,%%zmm3;"\ + "vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";" + 
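The asum and dot hunks above also replace the old (void *) cast of the thread worker with an explicit function-pointer cast, (int (*)(void)): converting a function pointer through void * is not portable C, and the explicit form satisfies stricter prototype checking. A minimal standalone sketch, not OpenBLAS's actual queue, of storing a worker through a generic function-pointer type and casting back before the call:

#include <stdio.h>

typedef int (*generic_fn)(void);

struct job { generic_fn routine; };   /* generic slot, like the level-1 thread queue's */

static int dot_worker(double *x, long n)
{
    double s = 0.0;
    for (long i = 0; i < n; i++) s += x[i] * x[i];
    return (int)s;
}

int main(void)
{
    double x[3] = { 1.0, 2.0, 3.0 };
    struct job q = { (generic_fn)dot_worker };                         /* store generically   */
    int (*run)(double *, long) = (int (*)(double *, long))q.routine;  /* cast back to call   */
    printf("%d\n", run(x, 3));                                         /* 14 */
    return 0;
}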
+#define unit_merge_to_ymm(c1_no) \ + "vextractf64x4 $1,%%zmm"#c1_no",%%ymm30; vaddpd %%ymm"#c1_no",%%ymm30,%%ymm"#c1_no";" + +#define KERNEL_k1m4n12 \ + "cmpq $2, %5; jb 104912f;"\ + "vmovupd 64+%11,%%zmm30;"\ + "\n204912:"\ + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; addq $64,%0;" \ + unit_acc_k2m4n2(4,5,%1) unit_acc_k2m4n2(6,7,%1,%%r12,1) unit_acc_k2m4n2(8, 9, %1, %%r12, 2) "addq $32,%1;" \ + unit_acc_k2m4n2(10,11,%%r15) unit_acc_k2m4n2(12,13,%%r15,%%r12,1) unit_acc_k2m4n2(14,15,%%r15,%%r12,2) "addq $32,%%r15;" \ + "subq $2, %5; cmpq $2, %5; jnb 204912b;"\ + unit_merge_to_ymm(4) unit_merge_to_ymm(5) unit_merge_to_ymm(6) unit_merge_to_ymm(7) \ + unit_merge_to_ymm(8) unit_merge_to_ymm(9) unit_merge_to_ymm(10) unit_merge_to_ymm(11) \ + unit_merge_to_ymm(12) unit_merge_to_ymm(13) unit_merge_to_ymm(14) unit_merge_to_ymm(15) \ + "testq %5, %5; jz 1004912f;"\ + "\n104912:"\ + KERNEL_h_k1m4n12 "addq $16,%%r15;"\ + "decq %5; jnz 104912b;"\ + "\n1004912:"\ + "incq %5;" + #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m4 "vmovddup (%0,%3,1),%%ymm1; vmovddup 8(%0,%3,1),%%ymm2; addq $32,%3;" #define acc_kend_nc2_k1m4(boff1) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) @@ -336,7 +362,31 @@ #define KERNEL_h_k1m2n10 KERNEL_h_k1m2n8 unit_acc_m2n2(12,13,%%r15,%%r12,1) #define KERNEL_k1m2n10 KERNEL_h_k1m2n10 "addq $16,%%r15;" #define KERNEL_h_k1m2n12 KERNEL_h_k1m2n10 unit_acc_m2n2(14,15,%%r15,%%r12,2) -#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" +//#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" + +#define unit_acc_k4m2n2(c1_no,c2_no,...) \ + "vmovupd ("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";" + +#define unit_merge_to_xmm(c1_no) \ + "vextractf64x2 $0,%%zmm"#c1_no",%%xmm20; vextractf64x2 $1,%%zmm"#c1_no",%%xmm21; vextractf64x2 $2,%%zmm"#c1_no",%%xmm22; vextractf64x2 $3,%%zmm"#c1_no",%%xmm23;"\ + "vaddpd %%xmm20,%%xmm21,%%xmm20; vaddpd %%xmm22,%%xmm23,%%xmm22; vaddpd %%xmm20,%%xmm22,%%xmm"#c1_no";" + +#define KERNEL_k1m2n12 \ + "cmpq $4,%5; jb 102912f;"\ + "\n402912:"\ + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; addq $64,%0;" \ + unit_acc_k4m2n2(4,5,%1) unit_acc_k4m2n2(6,7,%1,%%r12,1) unit_acc_k4m2n2(8,9,%1,%%r12,2) "addq $64,%1;" \ + unit_acc_k4m2n2(10,11,%%r15) unit_acc_k4m2n2(12,13,%%r15,%%r12,1) unit_acc_k4m2n2(14,15,%%r15,%%r12,2) "addq $64,%%r15;" \ + "subq $4,%5; cmpq $4,%5; jnb 402912b;"\ + unit_merge_to_xmm(4) unit_merge_to_xmm(5) unit_merge_to_xmm(6) unit_merge_to_xmm(7) unit_merge_to_xmm(8) unit_merge_to_xmm(9) \ + unit_merge_to_xmm(10) unit_merge_to_xmm(11) unit_merge_to_xmm(12) unit_merge_to_xmm(13) unit_merge_to_xmm(14) unit_merge_to_xmm(15) \ + "testq %5,%5; jz 1002912f;"\ + "\n102912:"\ + KERNEL_h_k1m2n12 "addq $16,%%r15;" \ + "decq %5; jnz 102912b;" \ + "\n1002912:"\ + "incq %5;" + #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m2 "vmovddup (%0,%3,1),%%xmm1; vmovddup 8(%0,%3,1),%%xmm2; addq $16,%3;" #define acc_kend_nc2_k1m2(boff1) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) @@ -387,7 +437,24 @@ #define KERNEL_h_k1m1n10 KERNEL_h_k1m1n8 "vfmadd231pd (%%r15,%%r12,1),%%xmm1,%%xmm8;" #define KERNEL_k1m1n10 KERNEL_h_k1m1n10 "addq $16,%%r15;" #define KERNEL_h_k1m1n12 KERNEL_h_k1m1n10 "vfmadd231pd (%%r15,%%r12,2),%%xmm1,%%xmm9;" -#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" +//#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" +#define KERNEL_k1m1n12 \ + "cmpq $4,%5; jb 101912f;" \ + "vmovupd %11,%%zmm2;"\ + "\n401912:"\ 
+ "vmovupd (%0),%%ymm1; vpermpd %%zmm1,%%zmm2,%%zmm1; addq $32,%0;" \ + "vfmadd231pd (%1),%%zmm1,%%zmm4; vfmadd231pd (%1,%%r12,1),%%zmm1,%%zmm5; vfmadd231pd (%1,%%r12,2),%%zmm1,%%zmm6; addq $64,%1;"\ + "vfmadd231pd (%%r15),%%zmm1,%%zmm7; vfmadd231pd (%%r15,%%r12,1),%%zmm1,%%zmm8; vfmadd231pd (%%r15,%%r12,2),%%zmm1,%%zmm9; addq $64,%%r15;"\ + "subq $4,%5; cmpq $4,%5; jnb 401912b;"\ + unit_merge_to_xmm(4) unit_merge_to_xmm(5) unit_merge_to_xmm(6) \ + unit_merge_to_xmm(7) unit_merge_to_xmm(8) unit_merge_to_xmm(9) \ + "testq %5,%5; jz 1001912f;"\ + "\n101912:"\ + KERNEL_h_k1m1n12 "addq $16,%%r15;" \ + "decq %5; jnz 101912b;" \ + "\n1001912:"\ + "incq %5;" + #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m1 "vmovddup (%0,%3,1),%%xmm1; addq $8,%3;" #define acc_kend_nc2_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" @@ -480,7 +547,7 @@ COMPUTE_SIMPLE(1,ndim) "subq $1,%%r11;"\ #ndim"33106:\n\t"\ "movq %%r14,%1;"\ - :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(b_pref):"m"(M),"m"(ALPHA),"m"(off),"m"(K):"r10","r11","r12","r13","r14","r15","cc","memory",\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(b_pref):"m"(M),"m"(ALPHA),"m"(off),"m"(K), "o"(permute_table):"r10","r11","r12","r13","r14","r15","cc","memory",\ "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15",\ "zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31");\ a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ndim * ldc - M; TAIL_SET_OFF(ndim)\ @@ -501,6 +568,10 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0; BLASLONG n_count = n, off = 0; double *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*b_pref = B; + int64_t permute_table[] = { + 0, 0, 1, 1, 2, 2, 3, 3, // abcdxxxx -> aabbccdd + 0, 1, 0, 1, 2, 3, 2, 3, // abcdxxxx -> ababcdcd + }; #ifdef TRMMKERNEL #ifdef LEFT off = offset; diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c new file mode 100644 index 000000000..df6c65ff7 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -0,0 +1,595 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
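The permute_table added to this kernel drives vpermpd: as its comments note, the first row (0,0,1,1,2,2,3,3) turns four packed doubles into the "aabbccdd" pattern and the second row gives "ababcdcd", so a single 512-bit FMA can combine several k iterations in the narrow tails. A standalone sketch, not part of the patch, of the first expansion using the equivalent permute intrinsic (AVX-512F; data invented):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    double a[8] = { 10, 20, 30, 40, 0, 0, 0, 0 };   /* only a[0..3] are meaningful */

    /* first row of the patch's permute_table: abcdxxxx -> aabbccdd */
    __m512i idx = _mm512_setr_epi64(0, 0, 1, 1, 2, 2, 3, 3);
    __m512d dup = _mm512_permutexvar_pd(idx, _mm512_loadu_pd(a));

    double out[8];
    _mm512_storeu_pd(out, dup);
    for (int i = 0; i < 8; i++) printf("%g ", out[i]);   /* 10 10 20 20 30 30 40 40 */
    printf("\n");
    return 0;
}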
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd() +#define LOAD_A_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&A[lda * k + i + (M*8)]) +#define MASK_LOAD_A_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &A[lda * k + i + (M*8)]) +#define BROADCAST_LOAD_B_512(M, N) __m512d Bval##N = _mm512_broadcastsd_pd(_mm_load_pd1(&B[k + ldb * (j+N)])) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N) +#if defined(B0) +#define STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#else +#define STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512)); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#endif + +#define LOAD_KA_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&mbuf[(mi + M)*K + k]); +#define LOAD_KB_512(M, N) __m512d Bval##N = _mm512_loadu_pd(&B[(j + N)*ldb + k]) +#define MASK_LOAD_KA_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &mbuf[(mi + M)*K + k]) +#define MASK_LOAD_KB_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[(j + N)*ldb + k]) +#define REDUCE_4(rr0, rr1, rr2, rr3) \ + __m512d r0, r1, r2, r3, t0, t1, t2, t3;\ + r0 = _mm512_unpacklo_pd(rr0, rr1); r1 = _mm512_unpackhi_pd(rr0, rr1); \ + r2 = _mm512_unpacklo_pd(rr2, rr3); r3 = _mm512_unpackhi_pd(rr2, rr3); \ + t0 = _mm512_permutex2var_pd(r0, idx_lo, r2); t1 = _mm512_permutex2var_pd(r1, idx_lo, r3); \ + t2 = _mm512_permutex2var_pd(r0, idx_hi, r2); t3 = _mm512_permutex2var_pd(r1, idx_hi, r3); \ + r0 = _mm512_add_pd(t0, t1); r1 = _mm512_add_pd(t2, t3); t0 = _mm512_add_pd(r0, r1); \ + __m256d s0, s1; \ + s0 = _mm512_extractf64x4_pd(t0, 0); s1 = _mm512_extractf64x4_pd(t0, 1); \ + s0 = _mm256_add_pd(s0, s1); s0 = _mm256_mul_pd(alpha_256, s0); +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) +#if defined(B0) +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N); +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); \ +} +#define STORE_REDUCE_N4(M) {\ + 
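In the non-B0 build, the STORE_512 / MASK_STORE_512 macros of this new small-matrix kernel fold beta*C into the result with a one-instruction inline-asm vfmadd231pd that reads C straight from memory. The following plain-intrinsics equivalent of that accumulation step is shown only to clarify what the asm does; it is not part of the patch (AVX-512F, values invented):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    double C[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
    double alpha = 2.0, beta = 3.0;

    __m512d result    = _mm512_set1_pd(4.0);            /* pretend accumulated A*B */
    __m512d alpha_512 = _mm512_set1_pd(alpha);
    __m512d beta_512  = _mm512_set1_pd(beta);

    result = _mm512_mul_pd(result, alpha_512);           /* alpha * result                */
    result = _mm512_fmadd_pd(_mm512_loadu_pd(C),          /* + beta * C: this is the step  */
                             beta_512, result);           /*   the asm vfmadd231pd performs */
    _mm512_storeu_pd(C, result);

    printf("%g\n", C[0]);                                 /* 2*4 + 3*1 = 11 */
    return 0;
}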
REDUCE_N4(M) \ + _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); \ +} +#else +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N) + beta * C[(j+N)*ldc + i + M]; +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + asm("vfmadd231pd (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_256)); \ + _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + s1 = _mm256_i64gather_pd(&C[j*ldc + i + M], vindex_n, 8); \ + s0 = _mm256_fmadd_pd(s1, beta_256, s0); \ + _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); \ +} +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n6 = N - (N % 6); + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + + __m512d alpha_512 = _mm512_broadcastsd_pd(_mm_load_pd1(&alpha)); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_pd1(&beta)); +#endif + + for (i = 0; i < m32; i += 32) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, 
x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m8; i += 8) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + int mm = M - i; + if (!mm) return 0; + if (mm > 4 || K < 16) { + register __mmask8 mask asm("k1") = (1UL << mm) - 1; + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + 
BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + } else { + /* M => [1, 4] + * + * This kernel use dot-like style to calc a value - C(x, y): + * C(x, y) = A(x, 0)*B(0, y) + A(x, 1)*B(1, y) +....+ A(x, K)*B(K, y) + * + * Alloc a buf to copy rest of A as row major, + * so memory access from 0 to K is continuous for both A & B. + * + * Loading to zmm and FMA 8 of k at one loop, + * finally reduce_add zmm to a single float result in C(x, y). + * + * Note: performance is bad when K is small. + */ + FLOAT *mbuf = (FLOAT *) malloc(sizeof(FLOAT)*mm*K); + __mmask8 mask = (1UL << mm) - 1; + BLASLONG k8 = K & ~7; + BLASLONG k4 = K & ~3; + for (k = 0; k < k4; k += 4) { + __m256d r0, r1, r2, r3; + __m256d t0, t1, t2, t3; + r0 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(0 + k)]); + r1 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(1 + k)]); + r2 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(2 + k)]); + r3 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(3 + k)]); + + t0 = _mm256_unpacklo_pd(r0, r1); + t1 = _mm256_unpackhi_pd(r0, r1); + t2 = _mm256_unpacklo_pd(r2, r3); + t3 = _mm256_unpackhi_pd(r2, r3); + + r0 = _mm256_permute2f128_pd(t0, t2, 0x20); + r1 = _mm256_permute2f128_pd(t1, t3, 0x20); + r2 = _mm256_permute2f128_pd(t0, t2, 0x31); + r3 = _mm256_permute2f128_pd(t1, t3, 0x31); + + switch (mm) { + case 4: _mm256_storeu_pd(&mbuf[k + 3*K], r3); + case 3: _mm256_storeu_pd(&mbuf[k + 2*K], r2); + case 2: _mm256_storeu_pd(&mbuf[k + 1*K], r1); + case 1: _mm256_storeu_pd(&mbuf[k + 0*K], r0); + } + } + for (; k < K; k++) { + for (int ii = 0; ii < mm; ii++) { + mbuf[k + ii*K] = A[i + lda*k + ii]; + } + } + int mi = 0; + __m256d alpha_256 = _mm256_broadcast_sd(&alpha); +#if !defined(B0) + __m256d beta_256 = _mm256_broadcast_sd(&beta); +#endif + __m256i vindex_n = _mm256_set_epi64x(ldc*3, ldc*2, ldc*1, 0); + long long permute_table[] = { + 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, + 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, + }; + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); + for (; i < m4; i += 4, mi += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); 
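For the leftover M <= 4 rows handled here, the kernel first repacks those rows of column-major A into the small row-major buffer mbuf with a 4x4 transpose built from unpacklo/unpackhi and permute2f128, so the dot-style k loop can then stream both A and B contiguously. A standalone sketch of that transpose, not part of the patch (AVX; the column data is invented):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    /* four columns of A (column-major), four rows each */
    double col0[4] = { 0, 1, 2, 3 }, col1[4] = { 4, 5, 6, 7 },
           col2[4] = { 8, 9,10,11 }, col3[4] = {12,13,14,15 };

    __m256d r0 = _mm256_loadu_pd(col0), r1 = _mm256_loadu_pd(col1),
            r2 = _mm256_loadu_pd(col2), r3 = _mm256_loadu_pd(col3);

    __m256d t0 = _mm256_unpacklo_pd(r0, r1), t1 = _mm256_unpackhi_pd(r0, r1);
    __m256d t2 = _mm256_unpacklo_pd(r2, r3), t3 = _mm256_unpackhi_pd(r2, r3);

    __m256d row0 = _mm256_permute2f128_pd(t0, t2, 0x20);  /* row 0 across all columns */
    __m256d row1 = _mm256_permute2f128_pd(t1, t3, 0x20);
    __m256d row2 = _mm256_permute2f128_pd(t0, t2, 0x31);
    __m256d row3 = _mm256_permute2f128_pd(t1, t3, 0x31);

    double out[4];
    _mm256_storeu_pd(out, row0);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);   /* 0 4 8 12 */
    (void)row1; (void)row2; (void)row3;
    return 0;
}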
MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE_M4(0); + } + + } + for (; i < m2; i += 2, mi += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); + } + for (; j 
< n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1, mi += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE_N4(0); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + free(mbuf); + } + return 0; +} +#else +#include "../generic/gemm_small_matrix_kernel_nn.c" +#endif + diff --git a/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c new file mode 100644 index 000000000..e757197ba --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c @@ -0,0 +1,535 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
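In these narrow tails each C element comes from a full-width accumulator that is summed horizontally: STORE_REDUCE scales the horizontal sum by alpha and, in the non-B0 build, adds beta*C. A self-contained sketch of that scalar store path, not part of the patch (AVX-512F; values invented):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    double alpha = 2.0, beta = 1.0, c = 5.0;
    __m512d acc = _mm512_set1_pd(1.0);                 /* pretend per-lane partial dot products */

    c = alpha * _mm512_reduce_add_pd(acc) + beta * c;  /* the STORE_REDUCE formula */
    printf("%g\n", c);                                 /* 2*8 + 1*5 = 21 */
    return 0;
}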
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd() +#define LOAD_A_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&A[lda * k + i + (M*8)]) +#define MASK_LOAD_A_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &A[lda * k + i + (M*8)]) +#define BROADCAST_LOAD_B_512(M, N) __m512d Bval##N = _mm512_broadcastsd_pd(_mm_load_sd(&B[ldb * k + j + N])) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N) + +#define BROADCAST_LOAD_A_512(M, N) __m512d Aval##M = _mm512_broadcastsd_pd(_mm_load_sd(&A[lda * k + i + M])) +#define LOAD_B_512(M, N) __m512d Bval##N = _mm512_loadu_pd(&B[ldb * k + j + (N*8)]) +#define MASK_LOAD_B_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[ldb * k + j + (N*8)]) +#if defined(B0) +#define STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8) +#else +#define STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512)); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = _mm512_i64gather_pd(vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_i64scatter_pd(&C[(j + 
N*8)*ldc + i + M], vindex_n, result##M##N, 8); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = _mm512_mask_i64gather_pd(_mm512_setzero_pd(), mask, vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8); +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n32 = N & ~31; + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n6 = N - (N % 6); + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + + __m512d alpha_512 = _mm512_broadcastsd_pd(_mm_load_sd(&alpha)); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); +#endif + + for (i = 0; i < m32; i += 32) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); DECLARE_RESULT_512(2, 4); DECLARE_RESULT_512(3, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); DECLARE_RESULT_512(2, 5); DECLARE_RESULT_512(3, 5); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + MATMUL_512(0, 4); MATMUL_512(1, 4); MATMUL_512(2, 4); MATMUL_512(3, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); MATMUL_512(2, 5); MATMUL_512(3, 5); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + STORE_512(0, 4); STORE_512(1, 4); STORE_512(2, 4); STORE_512(3, 4); + STORE_512(0, 5); STORE_512(1, 5); STORE_512(2, 5); STORE_512(3, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); 
+ MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + DECLARE_RESULT_512(0, 6); DECLARE_RESULT_512(1, 6); + DECLARE_RESULT_512(0, 7); DECLARE_RESULT_512(1, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + MATMUL_512(0, 6); MATMUL_512(1, 6); + MATMUL_512(0, 7); MATMUL_512(1, 7); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + STORE_512(0, 6); STORE_512(1, 6); + STORE_512(0, 7); STORE_512(1, 7); + } + for (;j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m8; i += 8) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + 
DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + STORE_512(0, 6); + STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + int mm = M - i; + if (mm >= 6) { + register __mmask16 mask asm("k1") = (1UL << mm) - 1; + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); + MASK_STORE_512(0, 6); + MASK_STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + 
MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + } else if (mm > 0) { + long long index_n[8]; + for (int ii = 0; ii < 8; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_si512(index_n); + for (; i < m4; i += 4) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); SCATTER_STORE_512(2, 2); SCATTER_STORE_512(3, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); SCATTER_STORE_512(2, 3); SCATTER_STORE_512(3, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + } + __mmask8 mask = 0xff; + for (; j < N; j += 8) { + int remains = N - j; + if (remains < 8) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); MASK_SCATTER_STORE_512(2, 0); MASK_SCATTER_STORE_512(3, 0); + } + } + for (; i < m2; i += 2) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + 
MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask8 mask = 0xff; + for (; j < N; j += 8) { + int remains = N - j; + if (remains < 8) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask8 mask = 0xff; + for (; j < N; j += 8) { + int remains = N - j; + if (remains < 8) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } + } + return 0; +} diff --git a/kernel/x86_64/dgemm_small_kernel_permit_skylakex.c b/kernel/x86_64/dgemm_small_kernel_permit_skylakex.c new file mode 100644 index 000000000..9cca08e71 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_permit_skylakex.c @@ -0,0 +1,44 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
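When only one or two rows of C remain, the NT kernel above keeps a whole row of C in one register by addressing C through an index vector of column strides (0, ldc, 2*ldc, ...) and using gather/scatter stores. A standalone sketch of that addressing scheme, not part of the patch (AVX-512F; matrix shape and values are invented):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    long long ldc = 4;                       /* column-major C with 4 rows, 8 columns */
    double C[8 * 4];
    for (int i = 0; i < 32; i++) C[i] = 0.0;

    /* like the kernel's vindex_n: lane j points at C[row + j*ldc] */
    __m512i vindex = _mm512_setr_epi64(0, ldc, 2 * ldc, 3 * ldc,
                                       4 * ldc, 5 * ldc, 6 * ldc, 7 * ldc);

    __m512d row = _mm512_set1_pd(7.0);       /* pretend results for row 0, columns 0..7 */
    _mm512_i64scatter_pd(&C[0], vindex, row, 8);    /* scale = 8 bytes per double */

    printf("%g %g\n", C[0 * ldc + 0], C[7 * ldc + 0]);   /* 7 7 */
    return 0;
}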
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta)
+{
+    double MNK = (double) M * (double) N * (double) K;
+    if (MNK > 100.0*100.0*100.0) // disable for large matrices
+        return 0;
+    if (transa && !transb) {
+        /* The TN kernel does not perform well when:
+         * 1. the C matrix is too large
+         * 2. K is too small
+         */
+        if (M * N > 1200 || K < 32)
+            return 0;
+    }
+    return 1;
+}
diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c
new file mode 100644
index 000000000..37d1ca497
--- /dev/null
+++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c
@@ -0,0 +1,327 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+*****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd() +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N) + +#define LOAD_KA_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&A[(i + M)*lda + k]); +#define LOAD_KB_512(M, N) __m512d Bval##N = _mm512_loadu_pd(&B[(j + N)*ldb + k]) +#define MASK_LOAD_KA_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &A[(i + M)*lda + k]) +#define MASK_LOAD_KB_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[(j + N)*ldb + k]) + +#define REDUCE_4(rr0, rr1, rr2, rr3) \ + __m512d r0, r1, r2, r3, t0, t1, t2, t3;\ + r0 = _mm512_unpacklo_pd(rr0, rr1); r1 = _mm512_unpackhi_pd(rr0, rr1); \ + r2 = _mm512_unpacklo_pd(rr2, rr3); r3 = _mm512_unpackhi_pd(rr2, rr3); \ + t0 = _mm512_permutex2var_pd(r0, idx_lo, r2); t1 = _mm512_permutex2var_pd(r1, idx_lo, r3); \ + t2 = _mm512_permutex2var_pd(r0, idx_hi, r2); t3 = _mm512_permutex2var_pd(r1, idx_hi, r3); \ + r0 = _mm512_add_pd(t0, t1); r1 = _mm512_add_pd(t2, t3); t0 = _mm512_add_pd(r0, r1); \ + __m256d s0, s1; \ + s0 = _mm512_extractf64x4_pd(t0, 0); s1 = _mm512_extractf64x4_pd(t0, 1); \ + s0 = _mm256_add_pd(s0, s1); s0 = _mm256_mul_pd(alpha_256, s0); + +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) + +#if defined(B0) +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N) +#define STORE_M4(N, s0) _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); +#define STORE_N4(M, s0) _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); +#else +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N) + beta * C[(j+N)*ldc + i + M] +#define STORE_M4(N, s0) \ + asm("vfmadd231pd (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_256)); \ + _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); + +#define STORE_N4(M, s0) \ + s0 = _mm256_fmadd_pd(_mm256_i64gather_pd(&C[j*ldc + i + M], vindex_n, 8), beta_256, s0); \ + _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); +#endif +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + STORE_M4(N, s0) \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + STORE_N4(M, s0) \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k8 = K & ~7; + + __mmask8 mask; + + __m256i vindex_n = _mm256_set_epi64x(ldc*3, ldc*2, ldc, 0); + __m256d alpha_256 = _mm256_broadcast_sd(&alpha); +#if !defined(B0) + __m256d beta_256 = _mm256_broadcast_sd(&beta); +#endif + + long long permute_table[] = { + 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, + 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, + }; + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); + + for (i = 0; i < m4; i += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); 
DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE_M4(0); + } + + } + for (; i < m2; i += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + 
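/* Explanatory note, not part of the patch.
 * At this point in the TN kernel's k-loop each LOAD_KA_512/LOAD_KB_512 has
 * pulled eight consecutive K-elements of a column of A (a row of op(A)) and
 * of a column of B; the MATMUL_512 calls below FMA-accumulate their
 * elementwise products into result<M><N>.  Each accumulator therefore holds
 * eight partial sums of a single dot product, which the STORE_REDUCE* macros
 * collapse with a horizontal add (applying alpha, and beta in the non-B0
 * build) once the k-loop ends.  The K tail is handled by the masked loads
 * with mask = (1UL << remains) - 1, which zero the lanes past K. */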
+ MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE_N4(0); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + return 0; +} +#else +#include "../generic/gemm_small_matrix_kernel_tn.c" +#endif + diff --git a/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c new file mode 100644 index 000000000..00f42aa76 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c @@ -0,0 +1,392 @@ 
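/* Illustrative reference (not part of the patch; function and variable names
 * below are hypothetical).  The small-matrix TN kernel above computes, in
 * column-major storage, C = alpha * A^T * B (+ beta * C in the non-B0 build),
 * with A stored K x M (lda >= K) and B stored K x N (ldb >= K).  The permit
 * routine earlier in this patch gates these kernels to small problems:
 * M*N*K <= 100^3, and for TN additionally M*N <= 1200 and K >= 32.
 * A plain-C sketch of the same operation:
 */
static void small_gemm_tn_ref(long M, long N, long K,
                              const double *A, long lda, double alpha,
                              const double *B, long ldb, double beta,
                              double *C, long ldc)
{
    for (long j = 0; j < N; j++) {
        for (long i = 0; i < M; i++) {
            double s = 0.0;
            for (long k = 0; k < K; k++)
                s += A[i * lda + k] * B[j * ldb + k];   /* A^T(i,k) * B(k,j) */
            C[j * ldc + i] = alpha * s + beta * C[j * ldc + i];
        }
    }
}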
+/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" +#include + +#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd() +#define BROADCAST_LOAD_A_512(M, N) __m512d Aval##M = _mm512_broadcastsd_pd(_mm_load_sd(&A[k + lda * (i+M)])) +#define LOAD_B_512(M,N) __m512d Bval##N = _mm512_loadu_pd(&B[ldb * k + j + (N*8)]) +#define MASK_LOAD_B_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[ldb * k + j + (N*8)]) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N) + +#if defined(B0) +#define STORE_8xy(v, N, x, y) _mm512_storeu_pd(&C[(j + N*8 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) _mm256_storeu_pd(&C[(j + N*8 + x + y*4)*ldc + i], v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8); +#else +#define STORE_8xy(v, N, x, y) \ + asm("vfmadd231pd (%1), %2, %0": "+v"(v): "r"(&C[(j + N*8 + x + y*8)*ldc + i]), "v"(beta_512)); \ + _mm512_storeu_pd(&C[(j + N*8 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) \ + asm("vfmadd231pd (%1), %2, %0": "+v"(v): "r"(&C[(j + N*8 + x + y*4)*ldc + i]), "v"(beta_256)); \ + _mm256_storeu_pd(&C[(j + N*8 + x + y*4)*ldc + i], v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = _mm512_i64gather_pd(vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = 
_mm512_mask_i64gather_pd(_mm512_setzero_pd(), mask, vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8); +#endif + +#define REORDER_8x8(r0, r1, r2, r3, r4, r5, r6, r7) \ + __m512d t0, t1, t2, t3, t4, t5, t6, t7; \ + t0 = _mm512_unpacklo_pd(r0, r1); \ + t1 = _mm512_unpackhi_pd(r0, r1); \ + t2 = _mm512_unpacklo_pd(r2, r3); \ + t3 = _mm512_unpackhi_pd(r2, r3); \ + t4 = _mm512_unpacklo_pd(r4, r5); \ + t5 = _mm512_unpackhi_pd(r4, r5); \ + t6 = _mm512_unpacklo_pd(r6, r7); \ + t7 = _mm512_unpackhi_pd(r6, r7); \ + r0 = _mm512_shuffle_f64x2(t0, t2, 0x88); \ + r1 = _mm512_shuffle_f64x2(t1, t3, 0x88); \ + r2 = _mm512_shuffle_f64x2(t0, t2, 0xdd); \ + r3 = _mm512_shuffle_f64x2(t1, t3, 0xdd); \ + r4 = _mm512_shuffle_f64x2(t4, t6, 0x88); \ + r5 = _mm512_shuffle_f64x2(t5, t7, 0x88); \ + r6 = _mm512_shuffle_f64x2(t4, t6, 0xdd); \ + r7 = _mm512_shuffle_f64x2(t5, t7, 0xdd); \ + t0 = _mm512_permutex2var_pd(r0, idx_lo, r4); \ + t1 = _mm512_permutex2var_pd(r1, idx_lo, r5); \ + t2 = _mm512_permutex2var_pd(r2, idx_lo, r6); \ + t3 = _mm512_permutex2var_pd(r3, idx_lo, r7); \ + t4 = _mm512_permutex2var_pd(r0, idx_hi, r4); \ + t5 = _mm512_permutex2var_pd(r1, idx_hi, r5); \ + t6 = _mm512_permutex2var_pd(r2, idx_hi, r6); \ + t7 = _mm512_permutex2var_pd(r3, idx_hi, r7); \ + t0 = _mm512_mul_pd(t0, alpha_512); \ + t1 = _mm512_mul_pd(t1, alpha_512); \ + t2 = _mm512_mul_pd(t2, alpha_512); \ + t3 = _mm512_mul_pd(t3, alpha_512); \ + t4 = _mm512_mul_pd(t4, alpha_512); \ + t5 = _mm512_mul_pd(t5, alpha_512); \ + t6 = _mm512_mul_pd(t6, alpha_512); \ + t7 = _mm512_mul_pd(t7, alpha_512); + +#define SAVE_8(N, x) {\ + STORE_8xy(t##x, N, x, 0); \ +} + +#define REORDER_STORE_8x8(N) {\ + REORDER_8x8(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + SAVE_8(N, 0); SAVE_8(N, 1); SAVE_8(N, 2); SAVE_8(N, 3); SAVE_8(N, 4); SAVE_8(N, 5); SAVE_8(N, 6); SAVE_8(N, 7); \ +} + +#define MASK_SAVE_8() \ + switch (nn) { \ + case 8: SAVE_8(0, 7); \ + case 7: SAVE_8(0, 6); \ + case 6: SAVE_8(0, 5); \ + case 5: SAVE_8(0, 4); \ + case 4: SAVE_8(0, 3); \ + case 3: SAVE_8(0, 2); \ + case 2: SAVE_8(0, 1); \ + case 1: SAVE_8(0, 0); \ + } + +#define MASK_REORDER_STORE_8x8(N) {\ + REORDER_8x8(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + MASK_SAVE_8(); \ +} + +#define REORDER_4x8(r0, r1, r2, r3) \ + __m512d t0, t1, t2, t3; \ + t0 = _mm512_unpacklo_pd(r0, r1); \ + t1 = _mm512_unpackhi_pd(r0, r1); \ + t2 = _mm512_unpacklo_pd(r2, r3); \ + t3 = _mm512_unpackhi_pd(r2, r3); \ + r0 = _mm512_permutex2var_pd(t0, idx_lo, t2); \ + r1 = _mm512_permutex2var_pd(t1, idx_lo, t3); \ + r2 = _mm512_permutex2var_pd(t0, idx_hi, t2); \ + r3 = _mm512_permutex2var_pd(t1, idx_hi, t3); \ + t0 = _mm512_mul_pd(r0, alpha_512); \ + t1 = _mm512_mul_pd(r1, alpha_512); \ + t2 = _mm512_mul_pd(r2, alpha_512); \ + t3 = _mm512_mul_pd(r3, alpha_512); + +#define SAVE_4(N, x, y) {\ + __m256d v4 = _mm512_extractf64x4_pd(t##x, y); \ + STORE_4xy(v4, N, x, y); \ +} + +#define REORDER_STORE_4x8(N) {\ + REORDER_4x8(result0##N, result1##N, result2##N, result3##N); \ + SAVE_4(N, 0, 0); SAVE_4(N, 1, 0); SAVE_4(N, 2, 0); SAVE_4(N, 3, 0); \ + SAVE_4(N, 0, 1); SAVE_4(N, 1, 1); SAVE_4(N, 2, 1); SAVE_4(N, 3, 1); \ +} + +#define MASK_SAVE_4() \ + switch (nn) { \ + case 8: SAVE_4(0, 3, 1); \ + case 7: SAVE_4(0, 2, 1); \ + case 6: SAVE_4(0, 1, 1); \ + case 5: 
SAVE_4(0, 0, 1); \ + case 4: SAVE_4(0, 3, 0); \ + case 3: SAVE_4(0, 2, 0); \ + case 2: SAVE_4(0, 1, 0); \ + case 1: SAVE_4(0, 0, 0); \ + } + +#define MASK_REORDER_STORE_4x8(N) {\ + REORDER_4x8(result0##N, result1##N, result2##N, result3##N); \ + MASK_SAVE_4(); \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n32 = N & ~31; + BLASLONG n16 = N & ~15; + + __m512d alpha_512 = _mm512_broadcastsd_pd(_mm_load_sd(&alpha)); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); + __m256d beta_256 = _mm256_broadcastsd_pd(_mm_load_sd(&beta)); +#endif + long long permute_table[] = { + 0, 1, 4, 5, 0|8, 1|8, 4|8, 5|8, + 2, 3, 6, 7, 2|8, 3|8, 6|8, 7|8, + }; + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); + + for (i = 0; i < m8; i += 8) { + for (j = 0; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(4, 1); DECLARE_RESULT_512(5, 1); DECLARE_RESULT_512(6, 1); DECLARE_RESULT_512(7, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(4, 1); MATMUL_512(5, 1); MATMUL_512(6, 1); MATMUL_512(7, 1); + } + REORDER_STORE_8x8(0); + REORDER_STORE_8x8(1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + } + MASK_REORDER_STORE_8x8(0); + } + } + for (; i < m4; i += 4) { + long long permute_table2[] = { + 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, + 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, + }; + idx_lo = _mm512_loadu_si512(permute_table2); + idx_hi = _mm512_loadu_si512(permute_table2 + 8); + + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); 
DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + REORDER_STORE_4x8(0); + REORDER_STORE_4x8(1); + REORDER_STORE_4x8(2); + REORDER_STORE_4x8(3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + REORDER_STORE_4x8(0); + REORDER_STORE_4x8(1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_REORDER_STORE_4x8(0); + } + } + if (i < M) { + long long index_n[8]; + for (int ii = 0; ii < 8; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_si512(index_n); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); +#endif + for (; i < m2; i += 2) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask8 
mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } + } + return 0; +} diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index da68db0cd..f883d4f26 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dgemv_n_microk_nehalem-4.c" #elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dgemv_n_microk_haswell-4.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dgemv_n_microk_skylakex-4.c" #endif diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index a3bf28dc8..9688c6bf3 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
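/* Explanatory note on the store path of dgemm_small_kernel_tt_skylakex.c
 * above; not part of the patch.
 * Each accumulator result<M><N> holds one row of C (row i+M) across eight
 * consecutive columns, because A is broadcast per row while B is loaded along
 * its leading dimension.  C is column-major, so REORDER_8x8/REORDER_STORE_8x8
 * transpose the 8x8 block of registers (unpacklo/hi + shuffle_f64x2 +
 * permutex2var) and scale by alpha, after which each STORE_8xy writes eight
 * consecutive rows of a single column with one unmasked 512-bit store (the
 * non-B0 variant first folds in beta*C via the inline vfmadd231pd).
 * MASK_SAVE_8 relies on switch fall-through to store only the nn < 8 columns
 * remaining at the right edge of C. */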
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c index 66e9ff907..40c9cf19d 100644 --- a/kernel/x86_64/drot.c +++ b/kernel/x86_64/drot.c @@ -2,7 +2,7 @@ #if defined(SKYLAKEX) #include "drot_microk_skylakex-2.c" -#elif defined(HASWELL) +#elif defined(HASWELL) || defined(ZEN) #include "drot_microk_haswell-2.c" #endif @@ -196,7 +196,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT #else int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; #endif - blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); + blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)(void))rot_thread_function, nthreads); } #else rot_compute(n, x, inc_x, y, inc_y, c, s); diff --git a/kernel/x86_64/drot_microk_haswell-2.c b/kernel/x86_64/drot_microk_haswell-2.c index 72a87696e..cc5949b1a 100644 --- a/kernel/x86_64/drot_microk_haswell-2.c +++ b/kernel/x86_64/drot_microk_haswell-2.c @@ -1,6 +1,4 @@ -/* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) - +#if defined(HAVE_FMA3) && defined(HAVE_AVX2) #define HAVE_DROT_KERNEL 1 #include diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index d1270d20b..05c5c7f16 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dscal_microk_sandy-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dscal_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dscal_microk_skylakex-2.c" #endif diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index 573377ee0..590776005 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dsymv_L_microk_bulldozer-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dsymv_L_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dsymv_L_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index 530ac8b1d..f196aa364 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_U_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dsymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/omatcopy_rt.c b/kernel/x86_64/omatcopy_rt.c new file mode 100644 index 000000000..e695f00c5 --- /dev/null +++ b/kernel/x86_64/omatcopy_rt.c @@ -0,0 +1,373 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#ifdef HAVE_AVX + +#define ROWS_OF_BLOCK 384 + + /* +r: %0 = src, %1 = dst, %2 = src_ld, %3 = dst_ld, %4 = dst_tmp */ +/* m: %5 = num_rows, %6 = alpha */ +/* xmm15 = alpha */ +#define TRANS_4x4(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\ + "vunpcklps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t1_no"; vunpckhps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t2_no";"\ + "vunpcklps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t3_no"; vunpckhps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t4_no";"\ + "vunpcklpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a1_no"; vunpckhpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a2_no";"\ + "vunpcklpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a3_no"; vunpckhpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a4_no";" + +#define TRANS_4x8(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\ + "vunpcklps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t1_no"; vunpckhps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t2_no";"\ + "vunpcklps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t3_no"; vunpckhps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t4_no";"\ + "vunpcklpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a1_no"; vunpckhpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a2_no";"\ + "vunpcklpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a3_no"; vunpckhpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a4_no";" + +#define SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\ + "vmovups %%xmm"#b1_no",(%4); vmovups %%xmm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vmovups %%xmm"#b3_no",(%4); vmovups %%xmm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define SAVE_4x8(b1_no,b2_no,b3_no,b4_no) SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\ + "vextractf128 $1,%%ymm"#b1_no",(%4); vextractf128 $1,%%ymm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vextractf128 $1,%%ymm"#b3_no",(%4); vextractf128 $1,%%ymm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define COPY_4x16 "movq %1,%4; addq $16,%1;"\ + "vmulps (%0),%%ymm15,%%ymm0; vmulps 32(%0),%%ymm15,%%ymm4; vmulps (%0,%2,1),%%ymm15,%%ymm1; vmulps 32(%0,%2,1),%%ymm15,%%ymm5; leaq (%0,%2,2),%0;"\ + "vmulps (%0),%%ymm15,%%ymm2; vmulps 32(%0),%%ymm15,%%ymm6; vmulps (%0,%2,1),%%ymm15,%%ymm3; vmulps 32(%0,%2,1),%%ymm15,%%ymm7; leaq (%0,%2,2),%0;"\ + TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3)\ + TRANS_4x8(4,5,6,7,8,9,10,11) SAVE_4x8(4,5,6,7) + +#define COPY_4x8 "movq %1,%4; addq $16,%1;"\ + "vmulps (%0),%%ymm15,%%ymm0; vmulps (%0,%2,1),%%ymm15,%%ymm1; leaq (%0,%2,2),%0;"\ + "vmulps (%0),%%ymm15,%%ymm2; vmulps (%0,%2,1),%%ymm15,%%ymm3; leaq (%0,%2,2),%0;"\ + TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3) + +#define COPY_4x4 "movq %1,%4; addq $16,%1;"\ + "vmulps (%0),%%xmm15,%%xmm0; vmulps (%0,%2,1),%%xmm15,%%xmm1; leaq (%0,%2,2),%0;"\ + "vmulps (%0),%%xmm15,%%xmm2; vmulps (%0,%2,1),%%xmm15,%%xmm3; leaq (%0,%2,2),%0;"\ + TRANS_4x4(0,1,2,3,8,9,10,11) SAVE_4x4(0,1,2,3) + +#define COPY_4x2 \ + "vmovsd (%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ + "vmovsd (%0),%%xmm1; vmovhpd (%0,%2,1),%%xmm1,%%xmm1; vmulps %%xmm15,%%xmm1,%%xmm1; leaq (%0,%2,2),%0;"\ + "vpermilps $216,%%xmm0,%%xmm0; vpermilps $216,%%xmm1,%%xmm1; vunpcklpd %%xmm1,%%xmm0,%%xmm2; vunpckhpd %%xmm1,%%xmm0,%%xmm3;"\ + "vmovups %%xmm2,(%1); vmovups %%xmm3,(%1,%3,1); addq $16,%1;" + +#define COPY_4x1 \ + "vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ + "vinsertps $32,(%0),%%xmm0,%%xmm0; vinsertps $48,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ + "vmulps %%xmm15,%%xmm0,%%xmm0; vmovups %%xmm0,(%1); addq $16,%1;" + +#define SAVE_2x4(c1_no,c2_no,t1_no,t2_no) \ + "vunpcklps 
%%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t1_no"; vmulps %%xmm15,%%xmm"#t1_no",%%xmm"#t1_no";"\ + "vmovsd %%xmm"#t1_no",(%4); vmovhpd %%xmm"#t1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vunpckhps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t2_no"; vmulps %%xmm15,%%xmm"#t2_no",%%xmm"#t2_no";"\ + "vmovsd %%xmm"#t2_no",(%4); vmovhpd %%xmm"#t2_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define COPY_2x16 "movq %1,%4; addq $8,%1;"\ + "vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm2; vmovups (%0,%2,1),%%ymm1; vmovups 32(%0,%2,1),%%ymm3; leaq (%0,%2,2),%0;"\ + "vextractf128 $1,%%ymm0,%%xmm4; vextractf128 $1,%%ymm2,%%xmm6; vextractf128 $1,%%ymm1,%%xmm5; vextractf128 $1,%%ymm3,%%xmm7;"\ + SAVE_2x4(0,1,8,9) SAVE_2x4(4,5,8,9) SAVE_2x4(2,3,8,9) SAVE_2x4(6,7,8,9) + +#define COPY_2x8 "movq %1,%4; addq $8,%1;"\ + "vmovups (%0),%%ymm0; vmovups (%0,%2,1),%%ymm1; leaq (%0,%2,2),%0;"\ + "vextractf128 $1,%%ymm0,%%xmm2; vextractf128 $1,%%ymm1,%%xmm3;"\ + SAVE_2x4(0,1,4,5) SAVE_2x4(2,3,4,5) + +#define COPY_2x4 "movq %1,%4; addq $8,%1;"\ + "vmovups (%0),%%xmm0; vmovups (%0,%2,1),%%xmm1; leaq (%0,%2,2),%0;"\ + SAVE_2x4(0,1,4,5) + +#define COPY_2x2 \ + "vmovsd (%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vpermilps $216,%%xmm0,%%xmm0;"\ + "vmovsd %%xmm0,(%1); vmovhpd %%xmm0,(%1,%3,1); addq $8,%1;" + +#define COPY_2x1 \ + "vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vmovsd %%xmm0,(%1); addq $8,%1;" + +#define SAVE_1x4(c1_no)\ + "vmulps %%xmm15,%%xmm"#c1_no",%%xmm"#c1_no"; vmovss %%xmm"#c1_no",(%4); vextractps $1,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vextractps $2,%%xmm"#c1_no",(%4); vextractps $3,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define COPY_1x16 "movq %1,%4; addq $4,%1;"\ + "vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2)\ + "vmovups 32(%0),%%xmm1;" SAVE_1x4(1) "vmovups 48(%0),%%xmm2;" SAVE_1x4(2) "addq %2,%0;" + +#define COPY_1x8 "movq %1,%4; addq $4,%1;"\ + "vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2) "addq %2,%0;" + +#define COPY_1x4 "movq %1,%4; addq $4,%1; vmovups (%0),%%xmm1;" SAVE_1x4(1) "addq %2,%0;" + +#define COPY_1x2 "vmovsd (%0),%%xmm1; addq %2,%0; vmulps %%xmm15,%%xmm1,%%xmm1; vmovss %%xmm1,(%1); vextractps $1,%%xmm1,(%1,%3,1); addq $4,%1;" + +#define COPY_1x1 "vmulss (%0),%%xmm15,%%xmm1; vmovss %%xmm1,(%1); addq %2,%0; addq $4,%1;" + +#define COMPUTE(ndim){\ + src = src_base; dst = dst_base;\ + __asm__ __volatile__(\ + "vbroadcastss %6,%%ymm15; movq %5,%%r11; cmpq $4,%%r11; jb "#ndim"32f;"\ + #ndim"31:\n\t"\ + COPY_4x##ndim "subq $4,%%r11; cmpq $4,%%r11; jnb "#ndim"31b;"\ + #ndim"32:\n\t"\ + "cmpq $2,%%r11; jb "#ndim"33f;"\ + COPY_2x##ndim "subq $2,%%r11;"\ + #ndim"33:\n\t"\ + "testq %%r11,%%r11; jz "#ndim"34f;"\ + COPY_1x##ndim "subq $1,%%r11;"\ + #ndim"34:\n\t"\ + :"+r"(src),"+r"(dst),"+r"(src_ld_bytes),"+r"(dst_ld_bytes),"+r"(dst_tmp):"m"(num_rows),"m"(ALPHA):"r11","cc","memory"\ + ,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ +} +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){ + float *src, *dst, *dst_tmp, *src_base, *dst_base; + uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_rows = 0; + BLASLONG cols_left, rows_done; float ALPHA = alpha; + if(ALPHA==0.0){ + dst_base = b; + for(cols_left=cols;cols_left>0;cols_left--) {memset(dst_base,0,rows*sizeof(float)); 
dst_base += ldb;} + return 0; + } + for(rows_done=0;rows_done ROWS_OF_BLOCK) num_rows = ROWS_OF_BLOCK; + cols_left = cols; src_base = a + (int64_t)lda * (int64_t)rows_done; dst_base = b + rows_done; + if(ldb%1024>3 && ldb%1024<1021) for(;cols_left>15;cols_left-=16){COMPUTE(16) src_base += 16; dst_base += 16 * ldb;} + for(;cols_left>7;cols_left-=8){COMPUTE(8) src_base += 8; dst_base += 8 * ldb;} + for(;cols_left>3;cols_left-=4){COMPUTE(4) src_base += 4; dst_base += 4 * ldb;} + for(;cols_left>1;cols_left-=2){COMPUTE(2) src_base += 2; dst_base += 2 * ldb;} + if(cols_left>0){COMPUTE(1) src_base ++; dst_base += ldb;} + } + return 0; +} + +#else + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i, j; + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4; + + if (rows <= 0) return 0; + if (cols <= 0) return 0; + + a_offset = a; + b_offset = b; + + i = (rows >> 2); + if (i > 0) { + do { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 4; + + j = (cols >> 2); + if (j > 0) { + do { + /* Column 1 of MAT_B */ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + /* Column 2 of MAT_B */ + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + /* Column 3 of MAT_B */ + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + *(b_offset3 + 2) = *(a_offset3 + 2)*alpha; + *(b_offset4 + 2) = *(a_offset3 + 3)*alpha; + + /* Column 4 of MAT_B */ + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + *(b_offset3 + 3) = *(a_offset4 + 2)*alpha; + *(b_offset4 + 3) = *(a_offset4 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } // if(j > 0) + + + if (cols & 2) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + b_offset1 += ldb*2; + + } + + if (cols & 1) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + } + + i--; + } while (i > 0); + } + + + if (rows & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 2; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = 
*(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + b_offset1 += ldb*2; + + } + + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + } + } // if (rows & 2) + + + if (rows & 1) { + a_offset1 = a_offset; + a_offset += lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + a_offset1 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + a_offset1 += 2; + b_offset1 += ldb * 2; + } + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + } + } + + return 0; +} + +#endif diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c index d0cea9bee..37a92468f 100644 --- a/kernel/x86_64/sasum.c +++ b/kernel/x86_64/sasum.c @@ -11,7 +11,7 @@ #if defined(SKYLAKEX) #include "sasum_microk_skylakex-2.c" -#elif defined(HASWELL) +#elif defined(HASWELL) || defined(ZEN) #include "sasum_microk_haswell-2.c" #endif @@ -123,7 +123,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #else mode = BLAS_DOUBLE | BLAS_REAL; #endif - blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c index 8e6cb9a47..2eb5b9538 100644 --- a/kernel/x86_64/sasum_microk_haswell-2.c +++ b/kernel/x86_64/sasum_microk_haswell-2.c @@ -38,10 +38,10 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m256i abs_mask = _mm256_set1_epi32(0x7fffffff); for (i = 0; i < tail_index_AVX2; i += 32) { - accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); - accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+16]), abs_mask); - accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+24]), abs_mask); + accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask); + accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask); + accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+16]), abs_mask); + accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+24]), abs_mask); } accum_0 = accum_0 
+ accum_1 + accum_2 + accum_3; @@ -62,8 +62,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { - accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); } accum_20 += accum_21; diff --git a/kernel/x86_64/sasum_microk_skylakex-2.c b/kernel/x86_64/sasum_microk_skylakex-2.c index c8c69d1e0..fbc91b558 100644 --- a/kernel/x86_64/sasum_microk_skylakex-2.c +++ b/kernel/x86_64/sasum_microk_skylakex-2.c @@ -53,8 +53,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { - accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); } accum_20 += accum_21; diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index 7b2845636..ff911c52b 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "saxpy_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "saxpy_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "saxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/sbdot.c b/kernel/x86_64/sbdot.c index ef14fd618..a4e60b7c4 100644 --- a/kernel/x86_64/sbdot.c +++ b/kernel/x86_64/sbdot.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
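/* Illustrative sketch, not part of the patch.
 * The sasum micro-kernel hunks above accumulate |x| by loading the floats as
 * integer vectors and ANDing them with 0x7fffffff, which clears the IEEE-754
 * sign bit; the added (__m256i *)/(__m128i *) casts only give
 * _mm256_load_si256()/_mm_loadu_si128() the pointer type they are declared
 * with and do not change behaviour.  A scalar equivalent of the masking:
 */
#include <stdint.h>
#include <string.h>

static inline float abs_via_sign_mask(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);   /* reinterpret the float's bit pattern */
    bits &= 0x7fffffffu;              /* clear the sign bit -> |x| */
    memcpy(&x, &bits, sizeof x);
    return x;
}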
#include "common.h" -#if defined(COOPERLAKE) +#if defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #include "sbdot_microk_cooperlake.c" #endif diff --git a/kernel/x86_64/sbdot_microk_cooperlake.c b/kernel/x86_64/sbdot_microk_cooperlake.c index 067726cb1..2aefe46ff 100644 --- a/kernel/x86_64/sbdot_microk_cooperlake.c +++ b/kernel/x86_64/sbdot_microk_cooperlake.c @@ -79,21 +79,21 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) __m256 accum256_1 = _mm256_setzero_ps(); int tail_index_32 = n&(~31); for (int j = 0; j < tail_index_32; j += 32) { - accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[j+ 0]), (__m256bh) _mm256_loadu_si256(&y[j+ 0])); - accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256(&x[j+16]), (__m256bh) _mm256_loadu_si256(&y[j+16])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+ 0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+ 0])); + accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+16]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+16])); } accum256 = _mm256_add_ps(accum256, accum256_1); /* Processing the remaining <32 chunk with 16-elements processing */ if ((n&16) != 0) { - accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[tail_index_32]), (__m256bh) _mm256_loadu_si256(&y[tail_index_32])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[tail_index_32]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[tail_index_32])); } accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1)); /* Processing the remaining <16 chunk with 8-elements processing */ if ((n&8) != 0) { int tail_index_16 = n&(~15); - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16])); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16])); } /* Processing the remaining <8 chunk with masked 8-elements processing */ @@ -108,13 +108,13 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) } else if (n > 15) { /* n range from 16 to 31 */ /* Processing <32 chunk with 16-elements processing */ __m256 accum256 = _mm256_setzero_ps(); - accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[0]), (__m256bh) _mm256_loadu_si256(&y[0])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[0])); accum128 += _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1)); /* Processing the remaining <16 chunk with 8-elements processing */ if ((n&8) != 0) { int tail_index_16 = n&(~15); - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16])); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16])); } /* Processing the remaining <8 chunk with masked 8-elements processing */ @@ -128,7 +128,7 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) } } else if (n > 7) { /* n range from 8 to 15 */ /* Processing <16 chunk with 8-elements processing */ - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[0]), (__m128bh) _mm_loadu_si128(&y[0])); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) 
_mm_loadu_si128((__m128i *)&x[0]), (__m128bh) _mm_loadu_si128((__m128i *)&y[0])); /* Processing the remaining <8 chunk with masked 8-elements processing */ if ((n&7) != 0) { diff --git a/kernel/x86_64/sbgemm_block_microk_cooperlake.c b/kernel/x86_64/sbgemm_block_microk_cooperlake.c new file mode 100644 index 000000000..b8c41f4f7 --- /dev/null +++ b/kernel/x86_64/sbgemm_block_microk_cooperlake.c @@ -0,0 +1,1871 @@ +#include + +// Walk around those intrinsics that missed by compiler +#define MM256_LOADU_EPI16(addr) \ + _mm256_maskz_loadu_epi16(~0, (addr)) +#define MM256_STOREU_EPI16(addr, reg) \ + _mm256_mask_storeu_epi16((addr), ~0, (reg)) + +// INCOPY Kernel, 16> (32-m)); + + __m512i array512_0, array512_1, array512_2, array512_3; + + bfloat16 * src_addr0, * src_addr1; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG LDA_2x = 2*lda; + BLASLONG BF16_BLOCK_T_M_2x = 2*32; + + src_addr0 = A; + src_addr1 = A + lda; + dst_addr0 = block_A; + dst_addr1 = block_A + 32; + + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr1); + array512_2 = _mm512_unpacklo_epi16(array512_0, array512_1); + array512_3 = _mm512_unpackhi_epi16(array512_0, array512_1); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + + src_addr0 += LDA_2x; + src_addr1 += LDA_2x; + dst_addr0 += BF16_BLOCK_T_M_2x; + dst_addr1 += BF16_BLOCK_T_M_2x; + } + + if (tag_k_2x != k) { + __m512i ZERO512 = _mm512_setzero_si512(); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0); + array512_2 = _mm512_unpacklo_epi16(array512_0, ZERO512); + array512_3 = _mm512_unpackhi_epi16(array512_0, ZERO512); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + } +} + +// INCOPY Kernel, 0> (16-m)); + + __m256i array256_0, array256_1, array256_2, array256_3; + + bfloat16 * src_addr0, * src_addr1; + bfloat16 * dst_addr0; + + BLASLONG LDA_2x = 2*lda; + + src_addr0 = A; + src_addr1 = A + lda; + dst_addr0 = block_A; + + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array256_0 = _mm256_maskz_loadu_epi16(tail_mask, src_addr0); + array256_1 = _mm256_maskz_loadu_epi16(tail_mask, src_addr1); + array256_2 = _mm256_unpacklo_epi16(array256_0, array256_1); + array256_3 = _mm256_unpackhi_epi16(array256_0, array256_1); + // Store in one row of block_B + MM256_STOREU_EPI16(dst_addr0, array256_2); + MM256_STOREU_EPI16(dst_addr0+16, array256_3); + + src_addr0 += LDA_2x; + src_addr1 += LDA_2x; + dst_addr0 += 32; + } + + if (tag_k_2x != k) { + __m256i ZERO256 = _mm256_setzero_si256(); + array256_0 = _mm256_maskz_loadu_epi16(tail_mask, src_addr0); + array256_2 = _mm256_unpacklo_epi16(array256_0, ZERO256); + array256_3 = _mm256_unpackhi_epi16(array256_0, ZERO256); + // Store in one row of block_B + MM256_STOREU_EPI16(dst_addr0, array256_2); + MM256_STOREU_EPI16(dst_addr0+16, array256_3); + } +} + +// K=32, M=16 +void COL_MAJOR_ITCOPY_KERNEL_32x16(bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG LDA_4x = lda*4; + + src_addr0 = A; + src_addr1 = A + lda; + src_addr2 = A + lda*2; + src_addr3 = A + lda*3; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*8; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, 
array512_way1_2, array512_way1_3; + __m512i array512_way2_0, array512_way2_1, array512_way2_2, array512_way2_3; + __m512i array512_way3_0, array512_way3_1, array512_way3_2, array512_way3_3; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + // Load and preprocess 1st 4 rows + array512_way0_0 = _mm512_loadu_si512(src_addr0); + array512_way0_1 = _mm512_loadu_si512(src_addr1); + array512_way0_2 = _mm512_loadu_si512(src_addr2); + array512_way0_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way0_0, array512_way0_1); + array512_1 = _mm512_unpackhi_epi32(array512_way0_0, array512_way0_1); + array512_2 = _mm512_unpacklo_epi32(array512_way0_2, array512_way0_3); + array512_3 = _mm512_unpackhi_epi32(array512_way0_2, array512_way0_3); + array512_way0_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way0_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way0_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way0_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 2nd 4 rows + array512_way1_0 = _mm512_loadu_si512(src_addr0); + array512_way1_1 = _mm512_loadu_si512(src_addr1); + array512_way1_2 = _mm512_loadu_si512(src_addr2); + array512_way1_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way1_0, array512_way1_1); + array512_1 = _mm512_unpackhi_epi32(array512_way1_0, array512_way1_1); + array512_2 = _mm512_unpacklo_epi32(array512_way1_2, array512_way1_3); + array512_3 = _mm512_unpackhi_epi32(array512_way1_2, array512_way1_3); + array512_way1_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way1_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way1_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way1_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 3rd 4 rows + array512_way2_0 = _mm512_loadu_si512(src_addr0); + array512_way2_1 = _mm512_loadu_si512(src_addr1); + array512_way2_2 = _mm512_loadu_si512(src_addr2); + array512_way2_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way2_0, array512_way2_1); + array512_1 = _mm512_unpackhi_epi32(array512_way2_0, array512_way2_1); + array512_2 = _mm512_unpacklo_epi32(array512_way2_2, array512_way2_3); + array512_3 = _mm512_unpackhi_epi32(array512_way2_2, array512_way2_3); + array512_way2_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way2_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way2_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way2_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 4th 4 rows + array512_way3_0 = _mm512_loadu_si512(src_addr0); + array512_way3_1 = _mm512_loadu_si512(src_addr1); + array512_way3_2 = _mm512_loadu_si512(src_addr2); + array512_way3_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way3_0, array512_way3_1); + array512_1 = _mm512_unpackhi_epi32(array512_way3_0, array512_way3_1); + array512_2 = _mm512_unpacklo_epi32(array512_way3_2, array512_way3_3); + array512_3 = 
_mm512_unpackhi_epi32(array512_way3_2, array512_way3_3); + array512_way3_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way3_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way3_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way3_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Compose and store the 0/1 and 16/17 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_lo_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_lo_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 2/3 and 18/19 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_lo_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_lo_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 4/5 and 20/21 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_lo_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_lo_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 6/7 and 22/23 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_lo_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_lo_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 8/9 and 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_hi_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_hi_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 10/11 and 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_hi_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_hi_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, 
array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 12/13 and 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_hi_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_hi_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 14/15 and 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_hi_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_hi_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); +} + +// K=Any number but will be processed based on 32, M=32 +void COL_MAJOR_ITCOPY_KERNEL_Kx32(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG tag_k_32x = k & (~31); + + BLASLONG LDA_4x = lda*4; + BLASLONG LDA_8x = lda*8; + BLASLONG LDA_12x = lda*12; + BLASLONG LDA_16x = lda*16; + + src_addr0 = A; + src_addr1 = A + lda; + src_addr2 = A + lda*2; + src_addr3 = A + lda*3; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*16; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; + __m512i array512_way2_0, array512_way2_1, array512_way2_2, array512_way2_3; + __m512i array512_way3_0, array512_way3_1, array512_way3_2, array512_way3_3; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + for (int i = 0; i < 2; i++) { + // Load and preprocess 1st 4 rows + array512_way0_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_way0_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_way0_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_way0_3 = _mm512_loadu_si512(src_addr3+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way0_0, array512_way0_1); + array512_1 = _mm512_unpackhi_epi32(array512_way0_0, array512_way0_1); + array512_2 = _mm512_unpacklo_epi32(array512_way0_2, array512_way0_3); + array512_3 = _mm512_unpackhi_epi32(array512_way0_2, array512_way0_3); + array512_way0_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way0_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way0_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way0_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 2nd 4 rows + array512_way1_0 = _mm512_loadu_si512(src_addr0+LDA_4x+idx_k); + array512_way1_1 = _mm512_loadu_si512(src_addr1+LDA_4x+idx_k); + array512_way1_2 = _mm512_loadu_si512(src_addr2+LDA_4x+idx_k); + array512_way1_3 = _mm512_loadu_si512(src_addr3+LDA_4x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way1_0, array512_way1_1); + array512_1 = 
_mm512_unpackhi_epi32(array512_way1_0, array512_way1_1); + array512_2 = _mm512_unpacklo_epi32(array512_way1_2, array512_way1_3); + array512_3 = _mm512_unpackhi_epi32(array512_way1_2, array512_way1_3); + array512_way1_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way1_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way1_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way1_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 3rd 4 rows + array512_way2_0 = _mm512_loadu_si512(src_addr0+LDA_8x+idx_k); + array512_way2_1 = _mm512_loadu_si512(src_addr1+LDA_8x+idx_k); + array512_way2_2 = _mm512_loadu_si512(src_addr2+LDA_8x+idx_k); + array512_way2_3 = _mm512_loadu_si512(src_addr3+LDA_8x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way2_0, array512_way2_1); + array512_1 = _mm512_unpackhi_epi32(array512_way2_0, array512_way2_1); + array512_2 = _mm512_unpacklo_epi32(array512_way2_2, array512_way2_3); + array512_3 = _mm512_unpackhi_epi32(array512_way2_2, array512_way2_3); + array512_way2_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way2_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way2_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way2_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 4th 4 rows + array512_way3_0 = _mm512_loadu_si512(src_addr0+LDA_12x+idx_k); + array512_way3_1 = _mm512_loadu_si512(src_addr1+LDA_12x+idx_k); + array512_way3_2 = _mm512_loadu_si512(src_addr2+LDA_12x+idx_k); + array512_way3_3 = _mm512_loadu_si512(src_addr3+LDA_12x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way3_0, array512_way3_1); + array512_1 = _mm512_unpackhi_epi32(array512_way3_0, array512_way3_1); + array512_2 = _mm512_unpacklo_epi32(array512_way3_2, array512_way3_3); + array512_3 = _mm512_unpackhi_epi32(array512_way3_2, array512_way3_3); + array512_way3_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way3_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way3_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way3_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Compose and store the 0/1 and 16/17 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_lo_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_lo_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 2/3 and 18/19 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_lo_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_lo_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 4/5 and 20/21 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_lo_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_lo_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, 
_mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 6/7 and 22/23 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_lo_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_lo_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 8/9 and 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_hi_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_hi_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 10/11 and 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_hi_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_hi_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 12/13 and 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_hi_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_hi_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 14/15 and 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_hi_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_hi_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + + src_addr0 += LDA_16x; + src_addr1 += LDA_16x; + src_addr2 += LDA_16x; + src_addr3 += LDA_16x; + dst_addr0 -= (64*7 - 32); + dst_addr1 -= (64*7 - 32); + } + src_addr0 -= (LDA_16x*2); + src_addr1 -= (LDA_16x*2); + src_addr2 -= (LDA_16x*2); + src_addr3 -= (LDA_16x*2); + dst_addr0 += (32*30); + dst_addr1 += (32*30); + } + + if (tag_k_32x != k) { + int k_rem = k - tag_k_32x; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + __m512i array512[16]; + + bfloat16 * dst_addr_tmp = dst_addr0; + + for (int i = 0; i < 2; i++) { + // Load and preprocess 1st 4 rows + 
array512[0] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[1] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[2] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[3] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[0], array512[1]); + array512_1 = _mm512_unpackhi_epi32(array512[0], array512[1]); + array512_2 = _mm512_unpacklo_epi32(array512[2], array512[3]); + array512_3 = _mm512_unpackhi_epi32(array512[2], array512[3]); + array512[0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[3] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 2nd 4 rows + array512[4] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[5] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[6] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[7] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[4], array512[5]); + array512_1 = _mm512_unpackhi_epi32(array512[4], array512[5]); + array512_2 = _mm512_unpacklo_epi32(array512[6], array512[7]); + array512_3 = _mm512_unpackhi_epi32(array512[6], array512[7]); + array512[4] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[5] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[6] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[7] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 3rd 4 rows + array512[8] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[9] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[10] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[11] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[8], array512[9]); + array512_1 = _mm512_unpackhi_epi32(array512[8], array512[9]); + array512_2 = _mm512_unpacklo_epi32(array512[10], array512[11]); + array512_3 = _mm512_unpackhi_epi32(array512[10], array512[11]); + array512[8] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[9] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[10] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[11] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 4th 4 rows + array512[12] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[13] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[14] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[15] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[12], array512[13]); + array512_1 = _mm512_unpackhi_epi32(array512[12], array512[13]); + array512_2 = _mm512_unpacklo_epi32(array512[14], array512[15]); + array512_3 = _mm512_unpackhi_epi32(array512[14], array512[15]); + array512[12] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[13] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[14] = _mm512_unpacklo_epi64(array512_1, 
array512_3); + array512[15] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // array512_01_1617_0, array512_01_1617_1, array512_89_2425_0, array512_89_2425_1; + // Half-compose of 0/1, 16/17, 8/9, 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512[0], permute_lo_idx, array512[4]); + array512_1 = _mm512_permutex2var_epi64(array512[8], permute_lo_idx, array512[12]); + array512_2 = _mm512_permutex2var_epi64(array512[0], permute_hi_idx, array512[4]); + array512_3 = _mm512_permutex2var_epi64(array512[8], permute_hi_idx, array512[12]); + array512[0] = array512_0; // 1st 8 pairs of col 0/1, and 1st 8 pairs of col 16/17 + array512[4] = array512_1; // 2nd 8 pairs of col 0/1, and 2nd 8 pairs of col 16/17 + array512[8] = array512_2; // 1st 8 pairs of col 8/9, and 1st 8 pairs of col 24/25 + array512[12] = array512_3; // 2nd 8 pairs of col 8/9, and 2nd 8 pairs of col 24/25 + + // Half-compose of 2/3, 18/19, 10/11, 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512[1], permute_lo_idx, array512[5]); + array512_1 = _mm512_permutex2var_epi64(array512[9], permute_lo_idx, array512[13]); + array512_2 = _mm512_permutex2var_epi64(array512[1], permute_hi_idx, array512[5]); + array512_3 = _mm512_permutex2var_epi64(array512[9], permute_hi_idx, array512[13]); + array512[1] = array512_0; // 1st 8 pairs of col 2/3, and 1st 8 pairs of col 18/19 + array512[5] = array512_1; // 2nd 8 pairs of col 2/3, and 2nd 8 pairs of col 18/19 + array512[9] = array512_2; // 1st 8 pairs of col 10/11, and 1st 8 pairs of col 26/27 + array512[13] = array512_3; // 2nd 8 pairs of col 10/11, and 2nd 8 pairs of col 26/27 + + // Half-compose of 4/5, 20/21, 12/13, 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512[2], permute_lo_idx, array512[6]); + array512_1 = _mm512_permutex2var_epi64(array512[10], permute_lo_idx, array512[14]); + array512_2 = _mm512_permutex2var_epi64(array512[2], permute_hi_idx, array512[6]); + array512_3 = _mm512_permutex2var_epi64(array512[10], permute_hi_idx, array512[14]); + array512[2] = array512_0; // 1st 8 pairs of col 4/5, and 1st 8 pairs of col 20/21 + array512[6] = array512_1; // 2nd 8 pairs of col 4/5, and 2nd 8 pairs of col 20/21 + array512[10] = array512_2; // 1st 8 pairs of col 12/13, and 1st 8 pairs of col 28/29 + array512[14] = array512_3; // 2nd 8 pairs of col 12/13, and 2nd 8 pairs of col 28/29 + + // Half-compose of 6/7, 22/23, 14/15, 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512[3], permute_lo_idx, array512[7]); + array512_1 = _mm512_permutex2var_epi64(array512[11], permute_lo_idx, array512[15]); + array512_2 = _mm512_permutex2var_epi64(array512[3], permute_hi_idx, array512[7]); + array512_3 = _mm512_permutex2var_epi64(array512[11], permute_hi_idx, array512[15]); + array512[3] = array512_0; // 1st 8 pairs of col 6/7, and 1st 8 pairs of col 22/23 + array512[7] = array512_1; // 2nd 8 pairs of col 6/7, and 2nd 8 pairs of col 22/23 + array512[11] = array512_2; // 1st 8 pairs of col 14/15, and 1st 8 pairs of col 30/31 + array512[15] = array512_3; // 2nd 8 pairs of col 14/15, and 2nd 8 pairs of col 30/31 + + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[0], _mm512_castsi512_si256(array512[4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 2/3 cols + array512_0 = _mm512_inserti64x4(array512[1], _mm512_castsi512_si256(array512[5]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + 
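The half-compose steps above rely on _mm512_permutex2var_epi64 with permute_lo_idx = (13, 12, 5, 4, 9, 8, 1, 0) and its +2 counterpart: index values 0-7 select 64-bit elements from the first operand, 8-15 from the second, so each call interleaves two 128-bit lanes of one register with the corresponding lanes of another. The sketch below is a standalone demo, not part of the patch (file name, variable names and printed expectations are mine), that makes the index encoding visible:

// Standalone demo: which qwords _mm512_permutex2var_epi64 picks with the
// permute_lo_idx / permute_hi_idx vectors used in the copy kernels.
// Compile with e.g. gcc -mavx512f permute_demo.c
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    // a = {0..7}, b = {100..107}; element 0 is the lowest qword
    __m512i a = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
    __m512i b = _mm512_set_epi64(107, 106, 105, 104, 103, 102, 101, 100);

    __m512i M512_EPI64_2   = _mm512_set1_epi64(2);
    __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0);
    __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2);

    long long lo[8], hi[8];
    _mm512_storeu_si512(lo, _mm512_permutex2var_epi64(a, permute_lo_idx, b));
    _mm512_storeu_si512(hi, _mm512_permutex2var_epi64(a, permute_hi_idx, b));

    // Expected: lo = 0 1 100 101 4 5 104 105  (128-bit lanes 0/2 of a interleaved with lanes 0/2 of b)
    //           hi = 2 3 102 103 6 7 106 107  (128-bit lanes 1/3 of a interleaved with lanes 1/3 of b)
    for (int i = 0; i < 8; i++) printf("%lld ", lo[i]);
    printf("\n");
    for (int i = 0; i < 8; i++) printf("%lld ", hi[i]);
    printf("\n");
    return 0;
}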
dst_addr0 += 64; + + // Compose and store the 4/5 cols + array512_0 = _mm512_inserti64x4(array512[2], _mm512_castsi512_si256(array512[6]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 6/7 cols + array512_0 = _mm512_inserti64x4(array512[3], _mm512_castsi512_si256(array512[7]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 8/9 cols + array512_0 = _mm512_inserti64x4(array512[8], _mm512_castsi512_si256(array512[12]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 10/11 cols + array512_0 = _mm512_inserti64x4(array512[9], _mm512_castsi512_si256(array512[13]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 12/13 cols + array512_0 = _mm512_inserti64x4(array512[10], _mm512_castsi512_si256(array512[14]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 14/15 cols + array512_0 = _mm512_inserti64x4(array512[11], _mm512_castsi512_si256(array512[15]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store 16 ~ k_rem cols + int idx_length = (k_rem + 1 - 16) >> 1; + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } + + dst_addr0 = dst_addr_tmp + 32; + } + } +} + +// K=Any number but will be processed based on 32, 16> 1; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + bfloat16 * dst_addr_tmp = dst_addr0; + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + // Load and preprocess 4 rows + array512[array_idx+0] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[array_idx+1] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[array_idx+2] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[array_idx+3] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + } + + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, 
array512[j+12]); + array512_2 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_3 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512[j+0] = array512_0; // 1st 8 pairs of col 0/1|2/3|4/5|6/7, and 1st 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+4] = array512_1; // 2nd 8 pairs of col 0/1|2/3|4/5|6/7, and 2nd 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+8] = array512_2; // 1st 8 pairs of col 8/9|10/11|12/13|14/15, and 1st 8 pairs of col 24/25|26/27|28/29|30/31 + array512[j+12] = array512_3; // 2nd 8 pairs of col 8/9|10/11|12/13|14/15, and 2nd 8 pairs of col 24/25|26/27|28/29|30/31 + } + + for (int j = 0; j < 4; j++) { + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int j = 8; j < 12; j++) { + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + // Compose and store 16 ~ k_rem cols + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } + + dst_addr0 = dst_addr_tmp + 32; + + for (int j = 0; j < m_rem; j++) { + array512[j] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+j*lda+tag_k_32x); + } + for (int j = m_rem; j < 16; j++) { + array512[j] = _mm512_setzero_si512(); + } + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + } + + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_3 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512[j+0] = array512_0; // 1st 8 pairs of col 0/1|2/3|4/5|6/7, and 1st 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+4] = array512_1; // 2nd 8 pairs of col 0/1|2/3|4/5|6/7, and 2nd 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+8] = array512_2; // 1st 8 pairs of col 8/9|10/11|12/13|14/15, and 1st 8 pairs of col 24/25|26/27|28/29|30/31 + array512[j+12] = 
array512_3; // 2nd 8 pairs of col 8/9|10/11|12/13|14/15, and 2nd 8 pairs of col 24/25|26/27|28/29|30/31 + } + + for (int j = 0; j < 4; j++) { + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int j = 8; j < 12; j++) { + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + // Compose and store 16 ~ k_rem cols + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } + } +} + +// K=Any number but will be processed based on 32, M=16 +void COL_MAJOR_ITCOPY_KERNEL_Kx16(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG tag_k_32x = k & (~31); + + BLASLONG LDA_4x = lda*4; + BLASLONG LDA_8x = lda*8; + BLASLONG LDA_12x = lda*12; + + src_addr0 = A; + src_addr1 = A + lda; + src_addr2 = A + lda*2; + src_addr3 = A + lda*3; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*8; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; + __m512i array512_way2_0, array512_way2_1, array512_way2_2, array512_way2_3; + __m512i array512_way3_0, array512_way3_1, array512_way3_2, array512_way3_3; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + // Load and preprocess 1st 4 rows + array512_way0_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_way0_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_way0_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_way0_3 = _mm512_loadu_si512(src_addr3+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way0_0, array512_way0_1); + array512_1 = _mm512_unpackhi_epi32(array512_way0_0, array512_way0_1); + array512_2 = _mm512_unpacklo_epi32(array512_way0_2, array512_way0_3); + array512_3 = _mm512_unpackhi_epi32(array512_way0_2, array512_way0_3); + array512_way0_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way0_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way0_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way0_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 2nd 4 rows + array512_way1_0 = _mm512_loadu_si512(src_addr0+LDA_4x+idx_k); + array512_way1_1 = _mm512_loadu_si512(src_addr1+LDA_4x+idx_k); + array512_way1_2 = _mm512_loadu_si512(src_addr2+LDA_4x+idx_k); + array512_way1_3 = 
_mm512_loadu_si512(src_addr3+LDA_4x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way1_0, array512_way1_1); + array512_1 = _mm512_unpackhi_epi32(array512_way1_0, array512_way1_1); + array512_2 = _mm512_unpacklo_epi32(array512_way1_2, array512_way1_3); + array512_3 = _mm512_unpackhi_epi32(array512_way1_2, array512_way1_3); + array512_way1_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way1_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way1_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way1_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 3rd 4 rows + array512_way2_0 = _mm512_loadu_si512(src_addr0+LDA_8x+idx_k); + array512_way2_1 = _mm512_loadu_si512(src_addr1+LDA_8x+idx_k); + array512_way2_2 = _mm512_loadu_si512(src_addr2+LDA_8x+idx_k); + array512_way2_3 = _mm512_loadu_si512(src_addr3+LDA_8x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way2_0, array512_way2_1); + array512_1 = _mm512_unpackhi_epi32(array512_way2_0, array512_way2_1); + array512_2 = _mm512_unpacklo_epi32(array512_way2_2, array512_way2_3); + array512_3 = _mm512_unpackhi_epi32(array512_way2_2, array512_way2_3); + array512_way2_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way2_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way2_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way2_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 4th 4 rows + array512_way3_0 = _mm512_loadu_si512(src_addr0+LDA_12x+idx_k); + array512_way3_1 = _mm512_loadu_si512(src_addr1+LDA_12x+idx_k); + array512_way3_2 = _mm512_loadu_si512(src_addr2+LDA_12x+idx_k); + array512_way3_3 = _mm512_loadu_si512(src_addr3+LDA_12x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way3_0, array512_way3_1); + array512_1 = _mm512_unpackhi_epi32(array512_way3_0, array512_way3_1); + array512_2 = _mm512_unpacklo_epi32(array512_way3_2, array512_way3_3); + array512_3 = _mm512_unpackhi_epi32(array512_way3_2, array512_way3_3); + array512_way3_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way3_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way3_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way3_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Compose and store the 0/1 and 16/17 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_lo_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_lo_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 2/3 and 18/19 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_lo_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_lo_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 4/5 and 20/21 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_lo_idx, array512_way1_2); + array512_1 = 
_mm512_permutex2var_epi64(array512_way2_2, permute_lo_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 6/7 and 22/23 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_lo_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_lo_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 8/9 and 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_hi_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_hi_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 10/11 and 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_hi_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_hi_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 12/13 and 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_hi_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_hi_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 14/15 and 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_hi_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_hi_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32*9; + dst_addr1 += 32*9; + } + + if (tag_k_32x != k) { + int k_rem = k - tag_k_32x; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + __m512i array512[16]; + + // Load and preprocess 1st 4 rows + array512[0] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[1] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[2] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + 
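The tail handling here builds a mask whose low k_rem bits are set, so a single masked load fetches the final partial group of up to 31 bf16 values and zero-fills the remaining lanes. A minimal standalone sketch of the same idiom follows (not from the patch; the kernels pass the unsigned int straight to the intrinsic, the explicit _cvtu32_mask32 below is only for clarity):

// Standalone sketch of the tail-mask idiom used in the copy kernels.
// Compile with e.g. gcc -mavx512f -mavx512bw tail_mask_demo.c
#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint16_t src[32];
    for (int i = 0; i < 32; i++) src[i] = (uint16_t)(i + 1);

    int k_rem = 5;  // 1 <= k_rem <= 31, as in the kernels above
    unsigned int tail_mask = ((unsigned int)0xffffffff) >> (32 - k_rem);
    __mmask32 m = _cvtu32_mask32(tail_mask);

    // Loads exactly k_rem 16-bit elements, the rest of the register is zeroed
    __m512i v = _mm512_maskz_loadu_epi16(m, src);

    uint16_t out[32];
    _mm512_storeu_si512(out, v);
    for (int i = 0; i < 32; i++) printf("%d ", out[i]);  // 1 2 3 4 5, then 27 zeros
    printf("\n");
    return 0;
}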
array512[3] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[0], array512[1]); + array512_1 = _mm512_unpackhi_epi32(array512[0], array512[1]); + array512_2 = _mm512_unpacklo_epi32(array512[2], array512[3]); + array512_3 = _mm512_unpackhi_epi32(array512[2], array512[3]); + array512[0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[3] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 2nd 4 rows + array512[4] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[5] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[6] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[7] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[4], array512[5]); + array512_1 = _mm512_unpackhi_epi32(array512[4], array512[5]); + array512_2 = _mm512_unpacklo_epi32(array512[6], array512[7]); + array512_3 = _mm512_unpackhi_epi32(array512[6], array512[7]); + array512[4] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[5] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[6] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[7] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 3rd 4 rows + array512[8] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[9] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[10] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[11] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[8], array512[9]); + array512_1 = _mm512_unpackhi_epi32(array512[8], array512[9]); + array512_2 = _mm512_unpacklo_epi32(array512[10], array512[11]); + array512_3 = _mm512_unpackhi_epi32(array512[10], array512[11]); + array512[8] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[9] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[10] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[11] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 4th 4 rows + array512[12] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[13] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[14] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[15] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[12], array512[13]); + array512_1 = _mm512_unpackhi_epi32(array512[12], array512[13]); + array512_2 = _mm512_unpacklo_epi32(array512[14], array512[15]); + array512_3 = _mm512_unpackhi_epi32(array512[14], array512[15]); + array512[12] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[13] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[14] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[15] = _mm512_unpackhi_epi64(array512_1, array512_3); + + // array512_01_1617_0, array512_01_1617_1, array512_89_2425_0, array512_89_2425_1; + // Half-compose of 0/1, 16/17, 8/9, 24/25 cols + array512_0 
= _mm512_permutex2var_epi64(array512[0], permute_lo_idx, array512[4]); + array512_1 = _mm512_permutex2var_epi64(array512[8], permute_lo_idx, array512[12]); + array512_2 = _mm512_permutex2var_epi64(array512[0], permute_hi_idx, array512[4]); + array512_3 = _mm512_permutex2var_epi64(array512[8], permute_hi_idx, array512[12]); + array512[0] = array512_0; // 1st 8 pairs of col 0/1, and 1st 8 pairs of col 16/17 + array512[4] = array512_1; // 2nd 8 pairs of col 0/1, and 2nd 8 pairs of col 16/17 + array512[8] = array512_2; // 1st 8 pairs of col 8/9, and 1st 8 pairs of col 24/25 + array512[12] = array512_3; // 2nd 8 pairs of col 8/9, and 2nd 8 pairs of col 24/25 + + // Half-compose of 2/3, 18/19, 10/11, 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512[1], permute_lo_idx, array512[5]); + array512_1 = _mm512_permutex2var_epi64(array512[9], permute_lo_idx, array512[13]); + array512_2 = _mm512_permutex2var_epi64(array512[1], permute_hi_idx, array512[5]); + array512_3 = _mm512_permutex2var_epi64(array512[9], permute_hi_idx, array512[13]); + array512[1] = array512_0; // 1st 8 pairs of col 2/3, and 1st 8 pairs of col 18/19 + array512[5] = array512_1; // 2nd 8 pairs of col 2/3, and 2nd 8 pairs of col 18/19 + array512[9] = array512_2; // 1st 8 pairs of col 10/11, and 1st 8 pairs of col 26/27 + array512[13] = array512_3; // 2nd 8 pairs of col 10/11, and 2nd 8 pairs of col 26/27 + + // Half-compose of 4/5, 20/21, 12/13, 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512[2], permute_lo_idx, array512[6]); + array512_1 = _mm512_permutex2var_epi64(array512[10], permute_lo_idx, array512[14]); + array512_2 = _mm512_permutex2var_epi64(array512[2], permute_hi_idx, array512[6]); + array512_3 = _mm512_permutex2var_epi64(array512[10], permute_hi_idx, array512[14]); + array512[2] = array512_0; // 1st 8 pairs of col 4/5, and 1st 8 pairs of col 20/21 + array512[6] = array512_1; // 2nd 8 pairs of col 4/5, and 2nd 8 pairs of col 20/21 + array512[10] = array512_2; // 1st 8 pairs of col 12/13, and 1st 8 pairs of col 28/29 + array512[14] = array512_3; // 2nd 8 pairs of col 12/13, and 2nd 8 pairs of col 28/29 + + // Half-compose of 6/7, 22/23, 14/15, 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512[3], permute_lo_idx, array512[7]); + array512_1 = _mm512_permutex2var_epi64(array512[11], permute_lo_idx, array512[15]); + array512_2 = _mm512_permutex2var_epi64(array512[3], permute_hi_idx, array512[7]); + array512_3 = _mm512_permutex2var_epi64(array512[11], permute_hi_idx, array512[15]); + array512[3] = array512_0; // 1st 8 pairs of col 6/7, and 1st 8 pairs of col 22/23 + array512[7] = array512_1; // 2nd 8 pairs of col 6/7, and 2nd 8 pairs of col 22/23 + array512[11] = array512_2; // 1st 8 pairs of col 14/15, and 1st 8 pairs of col 30/31 + array512[15] = array512_3; // 2nd 8 pairs of col 14/15, and 2nd 8 pairs of col 30/31 + + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[0], _mm512_castsi512_si256(array512[4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 2/3 cols + array512_0 = _mm512_inserti64x4(array512[1], _mm512_castsi512_si256(array512[5]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 4/5 cols + array512_0 = _mm512_inserti64x4(array512[2], _mm512_castsi512_si256(array512[6]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 6/7 cols + array512_0 = _mm512_inserti64x4(array512[3], 
_mm512_castsi512_si256(array512[7]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 8/9 cols + array512_0 = _mm512_inserti64x4(array512[8], _mm512_castsi512_si256(array512[12]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 10/11 cols + array512_0 = _mm512_inserti64x4(array512[9], _mm512_castsi512_si256(array512[13]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 12/13 cols + array512_0 = _mm512_inserti64x4(array512[10], _mm512_castsi512_si256(array512[14]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 14/15 cols + array512_0 = _mm512_inserti64x4(array512[11], _mm512_castsi512_si256(array512[15]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store 16 ~ k_rem cols + int idx_length = (k_rem + 1 - 16) >> 1; + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } + } +} + +// K=Any number but will be processed based on 32, M<=16 +void COL_MAJOR_ITCOPY_KERNEL_Kx16m(BLASLONG m, BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + bfloat16 * src_addr0; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG tag_k_32x = k & (~31); + + src_addr0 = A; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*8; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512[16]; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + for (int j = 0; j < m; j++) { + array512[j] = _mm512_loadu_si512(src_addr0+j*lda+idx_k); + } + for (int j = m; j < 16; j++) { + array512[j] = _mm512_setzero_si512(); + } + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + } + + // Compose and store the 0/1, 2/3, 4/5, 6/7 and 16/17, 18/19, 20/21, 22/23 cols + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 
= _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + } + + // Compose and store the 8/9, 10/11, 12/13, 14/15 and 24/25, 26/27, 28/29, 30/31 cols + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + } + + dst_addr0 += 32*8; + dst_addr1 += 32*8; + } + + if (tag_k_32x != k) { + int k_rem = k - tag_k_32x; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + + for (int j = 0; j < m; j++) { + array512[j] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+j*lda+tag_k_32x); + } + for (int j = m; j < 16; j++) { + array512[j] = _mm512_setzero_si512(); + } + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + } + + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_3 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512[j+0] = array512_0; // 1st 8 pairs of col 0/1|2/3|4/5|6/7, and 1st 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+4] = array512_1; // 2nd 8 pairs of col 0/1|2/3|4/5|6/7, and 2nd 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+8] = array512_2; // 1st 8 pairs of col 8/9|10/11|12/13|14/15, and 1st 8 pairs of col 24/25|26/27|28/29|30/31 + array512[j+12] = array512_3; // 2nd 8 pairs of col 8/9|10/11|12/13|14/15, and 2nd 8 pairs of col 24/25|26/27|28/29|30/31 + } + + for (int j = 0; j < 4; j++) { + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + for (int j = 8; j < 12; j++) { + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + // Compose and store 16 ~ k_rem cols + int idx_length = (k_rem + 1 - 16) >> 1; + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + 
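The compose steps splice 256-bit halves of two zmm registers: _mm512_castsi512_si256 takes the low half, _mm512_extracti64x4_epi64(x, 0x1) the high half, and _mm512_inserti64x4 overwrites either half of the destination. A standalone sketch (not part of the patch, names and expected output are illustrative) showing both forms used above:

// Standalone sketch of the 256-bit half splicing behind the "Compose and store" steps.
// Compile with e.g. gcc -mavx512f splice_demo.c
#include <immintrin.h>
#include <stdio.h>

static void print512(const char *name, __m512i v)
{
    long long q[8];
    _mm512_storeu_si512(q, v);
    printf("%s:", name);
    for (int i = 0; i < 8; i++) printf(" %lld", q[i]);
    printf("\n");
}

int main(void)
{
    __m512i a = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);          // a = {0..7}
    __m512i b = _mm512_set_epi64(17, 16, 15, 14, 13, 12, 11, 10);  // b = {10..17}

    // Low half of the result is the high half of a, high half stays b:
    __m512i hi_into_lo = _mm512_inserti64x4(b, _mm512_extracti64x4_epi64(a, 0x1), 0x0);
    // High half of the result is the low half of a, low half stays b:
    __m512i lo_into_hi = _mm512_inserti64x4(b, _mm512_castsi512_si256(a), 0x1);

    print512("hi_into_lo", hi_into_lo);   // 4 5 6 7 14 15 16 17
    print512("lo_into_hi", lo_into_hi);   // 10 11 12 13 0 1 2 3
    return 0;
}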
_mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } + } +} + +// COL_MAJOR_ONCOPY_KERNEL_16x32 behaves exactly the same as COL_MAJOR_ITCOPY_KERNEL_Kx16 +#define COL_MAJOR_ONCOPY_KERNEL_16x32 COL_MAJOR_ITCOPY_KERNEL_Kx16 + +void COL_MAJOR_ONCOPY_KERNEL_8x32(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_32x = k & (~31); + + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3, * src_addr4, * src_addr5, * src_addr6, * src_addr7; + bfloat16 * dst_addr0; + + unsigned char blend_mask = (((unsigned char)0xcc)); + __m512i permute_idx = _mm512_set_epi64(13, 12, 7, 6, 9, 8, 3, 2); + + src_addr0 = B; + src_addr1 = src_addr0 + 1*ldb; + src_addr2 = src_addr0 + 2*ldb; + src_addr3 = src_addr0 + 3*ldb; + src_addr4 = src_addr0 + 4*ldb; + src_addr5 = src_addr0 + 5*ldb; + src_addr6 = src_addr0 + 6*ldb; + src_addr7 = src_addr0 + 7*ldb; + dst_addr0 = block_B; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + array512_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_3 = _mm512_loadu_si512(src_addr3+idx_k); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_loadu_si512(src_addr4+idx_k); + array512_1 = _mm512_loadu_si512(src_addr5+idx_k); + array512_2 = _mm512_loadu_si512(src_addr6+idx_k); + array512_3 = _mm512_loadu_si512(src_addr7+idx_k); + + array512_way1_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way1_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way1_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way1_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way1_0, array512_way1_2); + array512_1 = _mm512_unpackhi_epi64(array512_way1_0, array512_way1_2); + array512_2 = _mm512_unpacklo_epi64(array512_way1_1, array512_way1_3); + array512_3 = _mm512_unpackhi_epi64(array512_way1_1, array512_way1_3); + + array512_way1_0 = _mm512_shuffle_i32x4(array512_0, 
array512_1, 0x22); + array512_way1_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x77); + array512_way1_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x22); + array512_way1_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x77); + + array512_0 = _mm512_mask_blend_epi64(blend_mask, array512_way0_0, array512_way1_0); + array512_1 = _mm512_mask_blend_epi64(blend_mask, array512_way0_1, array512_way1_1); + array512_2 = _mm512_mask_blend_epi64(blend_mask, array512_way0_2, array512_way1_2); + array512_3 = _mm512_mask_blend_epi64(blend_mask, array512_way0_3, array512_way1_3); + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way0_1, permute_idx, array512_way1_1); + array512_2 = _mm512_permutex2var_epi64(array512_way0_2, permute_idx, array512_way1_2); + array512_3 = _mm512_permutex2var_epi64(array512_way0_3, permute_idx, array512_way1_3); + _mm512_storeu_si512(dst_addr0+128, array512_0); + _mm512_storeu_si512(dst_addr0+160, array512_1); + _mm512_storeu_si512(dst_addr0+192, array512_2); + _mm512_storeu_si512(dst_addr0+224, array512_3); + + dst_addr0 += 256; + } + + if (tag_k_32x != k) { + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512_2 = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512_3 = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr4+tag_k_32x); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr5+tag_k_32x); + array512_2 = _mm512_maskz_loadu_epi16(tail_mask, src_addr6+tag_k_32x); + array512_3 = _mm512_maskz_loadu_epi16(tail_mask, src_addr7+tag_k_32x); + + array512_way1_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way1_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way1_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way1_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way1_0, array512_way1_2); + array512_1 = _mm512_unpackhi_epi64(array512_way1_0, array512_way1_2); + array512_2 = _mm512_unpacklo_epi64(array512_way1_1, array512_way1_3); + array512_3 = _mm512_unpackhi_epi64(array512_way1_1, array512_way1_3); + + array512_way1_0 = 
_mm512_shuffle_i32x4(array512_0, array512_1, 0x22); + array512_way1_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x77); + array512_way1_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x22); + array512_way1_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x77); + + + array512_0 = _mm512_mask_blend_epi64(blend_mask, array512_way0_0, array512_way1_0); + array512_1 = _mm512_mask_blend_epi64(blend_mask, array512_way0_1, array512_way1_1); + array512_2 = _mm512_mask_blend_epi64(blend_mask, array512_way0_2, array512_way1_2); + array512_3 = _mm512_mask_blend_epi64(blend_mask, array512_way0_3, array512_way1_3); + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way0_1, permute_idx, array512_way1_1); + array512_2 = _mm512_permutex2var_epi64(array512_way0_2, permute_idx, array512_way1_2); + array512_3 = _mm512_permutex2var_epi64(array512_way0_3, permute_idx, array512_way1_3); + _mm512_storeu_si512(dst_addr0+128, array512_0); + _mm512_storeu_si512(dst_addr0+160, array512_1); + _mm512_storeu_si512(dst_addr0+192, array512_2); + _mm512_storeu_si512(dst_addr0+224, array512_3); + } +} + +void COL_MAJOR_ONCOPY_KERNEL_4x32(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_32x = k & (~31); + + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0; + + src_addr0 = B; + src_addr1 = src_addr0 + 1*ldb; + src_addr2 = src_addr0 + 2*ldb; + src_addr3 = src_addr0 + 3*ldb; + dst_addr0 = block_B; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + array512_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_3 = _mm512_loadu_si512(src_addr3+idx_k); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0x88); + array512_1 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0x88); + array512_2 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0xdd); + array512_3 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0xdd); + + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + + dst_addr0 += 128; + } + + if (tag_k_32x != k) { 
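+        // k tail: k is not a multiple of 32, so the remaining k - tag_k_32x elements of each of the 4 source rows are brought in with a zero-filling masked load and packed by the same unpack/shuffle sequence as a full 32-element block (the zero padding is written into the packed buffer).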
+ unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512_2 = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512_3 = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0x88); + array512_1 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0x88); + array512_2 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0xdd); + array512_3 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0xdd); + + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + } +} + +void COL_MAJOR_ONCOPY_KERNEL_Nx32(BLASLONG n, BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_32x = k & (~31); + BLASLONG tag_n_2x = n & (~1); + + bfloat16 * src_addr0; + bfloat16 * dst_addr0; + + BLASLONG LDB_2x = 2*ldb; + + src_addr0 = B; + dst_addr0 = block_B; + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + src_addr0 = B; + for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { + _mm512_storeu_si512(dst_addr0, _mm512_loadu_si512(src_addr0 + idx_k)); + _mm512_storeu_si512(dst_addr0 + 32, _mm512_loadu_si512(src_addr0 + ldb + idx_k)); + src_addr0 += LDB_2x; + dst_addr0 += 64; + } + + if (tag_n_2x != n) { + _mm512_storeu_si512(dst_addr0, _mm512_loadu_si512(src_addr0 + idx_k)); + dst_addr0 += 32; + } + } + + if (tag_k_32x != k) { + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + src_addr0 = B; + for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { + _mm512_storeu_si512(dst_addr0, _mm512_maskz_loadu_epi16(tail_mask, src_addr0 + tag_k_32x)); + _mm512_storeu_si512(dst_addr0 + 32, _mm512_maskz_loadu_epi16(tail_mask, src_addr0 + ldb + tag_k_32x)); + src_addr0 += LDB_2x; + dst_addr0 += 64; + } + + if (tag_n_2x != n) { + _mm512_storeu_si512(dst_addr0, _mm512_maskz_loadu_epi16(tail_mask, src_addr0 + tag_k_32x)); + } + } +} + +void COL_MAJOR_OTCOPY_KERNEL_Kx8(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_2x = k & (~1); + unsigned char tail_mask_value = (unsigned char) 0xff; + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + + __m128i array128_0, array128_1, array128_2, array128_3; + + BLASLONG idx_src_base0, idx_src_base1; + BLASLONG 
idx_target_base0, idx_target_base1; + + BLASLONG LDA_2x = 2*ldb; + BLASLONG BF16_BLOCK_T_M_2x = 2*8; + idx_src_base0 = 0; + idx_src_base1 = ldb; + idx_target_base0 = 0; + idx_target_base1 = 8; + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_1 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base1]); + array128_2 = _mm_unpacklo_epi16(array128_0, array128_1); + array128_3 = _mm_unpackhi_epi16(array128_0, array128_1); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + + idx_src_base0 += LDA_2x; + idx_src_base1 += LDA_2x; + idx_target_base0 += BF16_BLOCK_T_M_2x; + idx_target_base1 += BF16_BLOCK_T_M_2x; + } + + if (tag_k_2x != k) { + __m128i ZERO128 = _mm_setzero_si128(); + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_2 = _mm_unpacklo_epi16(array128_0, ZERO128); + array128_3 = _mm_unpackhi_epi16(array128_0, ZERO128); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + } +} + +void COL_MAJOR_OTCOPY_KERNEL_Kx8m(BLASLONG k, BLASLONG n, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_2x = k & (~1); + unsigned char tail_mask = (((unsigned char)0xff) >> (8-n)); + + __m128i array128_0, array128_1, array128_2, array128_3; + + BLASLONG idx_src_base0, idx_src_base1; + BLASLONG idx_target_base0, idx_target_base1; + + BLASLONG LDA_2x = 2*ldb; + BLASLONG BF16_BLOCK_T_M_2x = 2*8; + idx_src_base0 = 0; + idx_src_base1 = ldb; + idx_target_base0 = 0; + idx_target_base1 = 8; + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_1 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base1]); + array128_2 = _mm_unpacklo_epi16(array128_0, array128_1); + array128_3 = _mm_unpackhi_epi16(array128_0, array128_1); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + + idx_src_base0 += LDA_2x; + idx_src_base1 += LDA_2x; + idx_target_base0 += BF16_BLOCK_T_M_2x; + idx_target_base1 += BF16_BLOCK_T_M_2x; + } + + if (tag_k_2x != k) { + __m128i ZERO128 = _mm_setzero_si128(); + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_2 = _mm_unpacklo_epi16(array128_0, ZERO128); + array128_3 = _mm_unpackhi_epi16(array128_0, ZERO128); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + } +} + +// Scale matrix C when beta is not ZERO or ONE +void sbgemm_scal_operation(BLASLONG M, BLASLONG N, float beta, float *C, BLASLONG ldc) +{ + float * C_addr0 = C; + float * C_addr1 = C + ldc; + float * C_addr2 = C + ldc*2; + float * C_addr3 = C + ldc*3; + + BLASLONG LDC4x = ldc*4; + + __m512 array_512_0, array_512_1, array_512_2, array_512_3; + __m512 BETAVECTOR = _mm512_set1_ps(beta); + + BLASLONG tag_n_Nx = N & (~3); + BLASLONG tag_n_Mx = M & (~15); + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); + for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + array_512_0 = _mm512_loadu_ps(C_addr0 + idx_m); + array_512_1 = _mm512_loadu_ps(C_addr1 + idx_m); + array_512_2 = _mm512_loadu_ps(C_addr2 + idx_m); + array_512_3 = _mm512_loadu_ps(C_addr3 + idx_m); + + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + array_512_1 = _mm512_mul_ps(BETAVECTOR, 
array_512_1); + array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); + array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); + + _mm512_storeu_ps(C_addr0 + idx_m, array_512_0); + _mm512_storeu_ps(C_addr1 + idx_m, array_512_1); + _mm512_storeu_ps(C_addr2 + idx_m, array_512_2); + _mm512_storeu_ps(C_addr3 + idx_m, array_512_3); + } + + if (tag_n_Mx != M) { + array_512_0 = _mm512_maskz_loadu_ps(tail_mask, C_addr0 + tag_n_Mx); + array_512_1 = _mm512_maskz_loadu_ps(tail_mask, C_addr1 + tag_n_Mx); + array_512_2 = _mm512_maskz_loadu_ps(tail_mask, C_addr2 + tag_n_Mx); + array_512_3 = _mm512_maskz_loadu_ps(tail_mask, C_addr3 + tag_n_Mx); + + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); + array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); + array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); + + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, array_512_0); + _mm512_mask_storeu_ps(C_addr1 + tag_n_Mx, tail_mask, array_512_1); + _mm512_mask_storeu_ps(C_addr2 + tag_n_Mx, tail_mask, array_512_2); + _mm512_mask_storeu_ps(C_addr3 + tag_n_Mx, tail_mask, array_512_3); + } + + C_addr0 += LDC4x; + C_addr1 += LDC4x; + C_addr2 += LDC4x; + C_addr3 += LDC4x; + } + + if (tag_n_Nx != N) { + for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + array_512_0 = _mm512_loadu_ps(C_addr0 + idx_m); + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + _mm512_storeu_ps(C_addr0 + idx_m, array_512_0); + } + + if (tag_n_Mx != M) { + array_512_0 = _mm512_maskz_loadu_ps(tail_mask, C_addr0 + tag_n_Mx); + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, array_512_0); + } + C_addr0 += ldc; + } + } +} + +// Zero C matrix when Beta is 0 +void sbgemm_zero_operation(BLASLONG M, BLASLONG N, float *C, BLASLONG ldc) +{ + float * C_addr0 = C; + float * C_addr1 = C + ldc; + float * C_addr2 = C + ldc*2; + float * C_addr3 = C + ldc*3; + + BLASLONG LDC4x = ldc*4; + + __m512 ZEROVECTOR = _mm512_setzero_ps(); + + BLASLONG tag_n_Nx = N & (~3); + BLASLONG tag_n_Mx = M & (~15); + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); + for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + _mm512_storeu_ps(C_addr0 + idx_m, ZEROVECTOR); + _mm512_storeu_ps(C_addr1 + idx_m, ZEROVECTOR); + _mm512_storeu_ps(C_addr2 + idx_m, ZEROVECTOR); + _mm512_storeu_ps(C_addr3 + idx_m, ZEROVECTOR); + } + + if (tag_n_Mx != M) { + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(C_addr1 + tag_n_Mx, tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(C_addr2 + tag_n_Mx, tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(C_addr3 + tag_n_Mx, tail_mask, ZEROVECTOR); + } + + C_addr0 += LDC4x; + C_addr1 += LDC4x; + C_addr2 += LDC4x; + C_addr3 += LDC4x; + } + + if (tag_n_Nx != N) { + for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + _mm512_storeu_ps(C_addr0 + idx_m, ZEROVECTOR); + } + + if (tag_n_Mx != M) { + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, ZEROVECTOR); + } + C_addr0 += ldc; + } + } +} diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr.c b/kernel/x86_64/sbgemm_kernel_16x16_spr.c new file mode 100644 index 000000000..955db3163 --- /dev/null +++ b/kernel/x86_64/sbgemm_kernel_16x16_spr.c @@ -0,0 +1,50 @@ +/*************************************************************************** + 
* Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include "common.h" + +#define ALPHA_ONE +#include "sbgemm_kernel_16x16_spr_tmpl.c" +#undef ALPHA_ONE +#include "sbgemm_kernel_16x16_spr_tmpl.c" + + +int CNAME (BLASLONG im, BLASLONG in, BLASLONG k, FLOAT alpha, IFLOAT * iA, IFLOAT * iB, FLOAT * C, BLASLONG ldc) +{ + /* transport to Row Major matrix for AMX requirement */ + BLASLONG m, n; + IFLOAT *A, *B; + m = in; + n = im; + A = iB; + B = iA; + + if (alpha == 1.0f) + return sbgemm_kernel_spr_alpha_one(m, n, k, alpha, A, B, C, ldc); + else + return sbgemm_kernel_spr_alpha(m, n, k, alpha, A, B, C, ldc); +} diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c new file mode 100644 index 000000000..90e0a32c7 --- /dev/null +++ b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c @@ -0,0 +1,530 @@ +/*************************************************************************** + * Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include +#include +#include "common.h" + +#ifndef SBGEMM_KERNEL_SPR +#define SBGEMM_KERNEL_SPR +typedef struct { + char palette_id; + char start_row; + char dummy0[14]; // bytes 2-15 reserved, must be zero + short tile_colsb[8]; + char dummy1[16]; // bytes 32-47 reserved, must be zero + char tile_rows[8]; + char dummy2[16]; // bytes 56-63 reserved, must be zero +} tilecfg; + +/* tile0/tile1 -- A (m x 2k) + * tile2/tile3 -- B (2k x n) + * tile4-7 -- C (m x n) + */ +#define TCONF(cfg, m, n, k2) \ + memset(&cfg, 0, sizeof(tilecfg)); \ + cfg.palette_id = 1; \ + cfg.tile_rows[0] = m; \ + cfg.tile_rows[1] = m; \ + cfg.tile_rows[2] = k2>>1; \ + cfg.tile_rows[3] = k2>>1; \ + cfg.tile_rows[4] = m; \ + cfg.tile_rows[5] = m; \ + cfg.tile_rows[6] = m; \ + cfg.tile_rows[7] = m; \ + cfg.tile_colsb[0] = k2<<1; \ + cfg.tile_colsb[1] = k2<<1; \ + cfg.tile_colsb[2] = n * 4; \ + cfg.tile_colsb[3] = n * 4; \ + cfg.tile_colsb[4] = n * 4; \ + cfg.tile_colsb[5] = n * 4; \ + cfg.tile_colsb[6] = n * 4; \ + cfg.tile_colsb[7] = n * 4; \ + _tile_loadconfig(&cfg); + +/* CONFIG for handling k2 and odd tail at the same time + * tile0 -- A (m x 2k) + * tile1 -- A (m x 1) + * tile2 -- B (2k x n) + * tile3 -- B (1 x n) + * tile4 -- C (m x n) + */ +#define TCONF_TAIL(cfg, m, n, k2) \ + memset(&cfg, 0, sizeof(tilecfg)); \ + cfg.palette_id = 1; \ + cfg.tile_rows[0] = m; \ + cfg.tile_rows[1] = m; \ + cfg.tile_rows[2] = k2>>1; \ + cfg.tile_rows[3] = 1; \ + cfg.tile_rows[4] = m; \ + cfg.tile_colsb[0] = k2<<1; \ + cfg.tile_colsb[1] = 4; \ + cfg.tile_colsb[2] = n * 4; \ + cfg.tile_colsb[3] = n * 4; \ + cfg.tile_colsb[4] = n * 4; \ + _tile_loadconfig(&cfg); + +#define T_A0 0 +#define T_A1 1 +#define T_B0 2 +#define T_B1 3 +#define T_C00 4 +#define T_C01 5 +#define T_C10 6 +#define T_C11 7 + +// FIXME: gcc11 seem have problem in tile load/store address calc, +// need to multiply with element size (2 or 4) here. 
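The stride argument of the AMX _tile_loadd/_tile_stored intrinsics is a byte count, which is why the macros below scale the leading dimensions by the element size (2 bytes per bfloat16 for the A/B tiles, 4 bytes per float for the C tiles). A minimal sketch of that convention, for illustration only and not part of the patch (the helper name and bf16_t typedef are stand-ins; tile numbers 0 and 4 mirror the T_A0/T_C00 assignments above, and a tile configuration is assumed to have been loaded already):

    /* Illustrative sketch only -- not part of the patch. Assumes a valid
     * tile configuration has been loaded via _tile_loadconfig() (see TCONF). */
    #include <immintrin.h>          /* _tile_loadd / _tile_stored (AMX-TILE) */

    typedef unsigned short bf16_t;  /* stand-in for OpenBLAS's bfloat16 */

    static inline void sketch_tile_strides(const bf16_t *a, long lda, float *c, long ldc)
    {
        /* lda/ldc are leading dimensions in elements; AMX expects the row
         * stride in bytes, hence the *2 and *4 factors used by the macros. */
        _tile_loadd(0, a, lda * (long)sizeof(bf16_t));  /* = lda * 2, matches LOAD_A  */
        _tile_stored(4, c, ldc * (long)sizeof(float));  /* = ldc * 4, matches STORE_C */
    }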
+#define LOAD_A(M, N) _tile_loadd(T_A##M, ptr_a##M, lda * 2) +#define LOAD_A_TAIL(M, N) {\ + __m256i ymm = _mm256_loadu_epi16(ptr_a##M); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ + _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ +} +#define MASK_LOAD_A_TAIL(M, N) {\ + __m256i ymm = _mm256_maskz_loadu_epi16(amask, ptr_a##M); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ + _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ +} +#define LOAD_B(M, N) _tile_loadd(T_B##N, ptr_b##N, ldb * 2) +#define LOAD_B_TAIL(M, N) {\ + __m256i ymm = _mm256_loadu_epi16(ptr_b##N); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ + _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ +} +#define MASK_LOAD_B_TAIL(M, N) {\ + __m256i ymm = _mm256_maskz_loadu_epi16(bmask, ptr_b##N); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ + _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ +} + +#define MATMUL(M, N) _tile_dpbf16ps(T_C##M##N, T_A##M, T_B##N) +#define MATMUL_TAIL(M, N) _tile_dpbf16ps(T_C00, T_A##M, T_B##N) +#define STORE_C(M, N) _tile_stored(T_C##M##N, ptr_c##M##N, ldc * 4) +#define LOAD_C_F(M, N) _tile_loadd(T_C##M##N, ptr_c##M##N, ldc * 4) + +#endif // end of SBGEMM_KERNEL_SPR + +#ifdef ALPHA_ONE +#undef LOAD_C +#define LOAD_C(M, N) _tile_loadd(T_C##M##N, ptr_c##M##N, ldc * 4) +#else +#undef LOAD_C +#define LOAD_C(M, N) _tile_zero(T_C##M##N) +#define ALPHA_STORE(N) \ + __m512 zmm_d##N = _mm512_loadu_ps(dst##N + noffset); \ + __m512 zmm_s##N = _mm512_loadu_ps(src##N + noffset); \ + zmm_d##N = _mm512_fmadd_ps(alpha_512, zmm_s##N, zmm_d##N); \ + _mm512_storeu_ps(dst##N + noffset, zmm_d##N); +#define MASK_APLPHA_STORE(N) \ + __m512 zmm_d##N = _mm512_maskz_loadu_ps(mask, dst##N + noffset); \ + __m512 zmm_s##N = _mm512_maskz_loadu_ps(mask, src##N + noffset); \ + zmm_d##N = _mm512_fmadd_ps(alpha_512, zmm_s##N, zmm_d##N); \ + _mm512_mask_storeu_ps(dst##N + noffset, mask, zmm_d##N); +#endif // end of ALPHA_ONE + + +#ifdef ALPHA_ONE +int sbgemm_kernel_spr_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#else +int sbgemm_kernel_spr_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#endif +{ + /* Row Major matrix for AMX requirement */ + IFLOAT *ptr_a = A, *ptr_b = B; + IFLOAT *ptr_b0, *ptr_b1; + IFLOAT *ptr_a0, *ptr_a1; + FLOAT *ptr_c = C; + FLOAT *ptr_c00, *ptr_c01, *ptr_c10, *ptr_c11; + + BLASLONG lda, ldb; + BLASLONG m_count = m; + BLASLONG n_count, k_count; + +#ifndef ALPHA_ONE + // make sure each row is 64 bytes aligned + BLASLONG cn = (n & 31) ? 
(n & ~31) + 32 : n; + FLOAT *raw_tmp_c; + if (k < 32) { + // only need to zero buff in this situation + raw_tmp_c = (FLOAT *)calloc(1, sizeof(FLOAT) * m * cn + 64); + } else { + raw_tmp_c = (FLOAT *)malloc(sizeof(FLOAT) * m * cn + 64); + } + // align buf to 64 byte boundary + FLOAT *tmp_c = (FLOAT *)(((uintptr_t) raw_tmp_c + 63) & ~(uintptr_t)63); + ptr_c = tmp_c; + BLASLONG ldc_o = ldc; + ldc = cn; +#endif + IFLOAT tail_a[32 * 2] __attribute__ ((aligned (64))); + IFLOAT tail_b[32 * 2] __attribute__ ((aligned (64))); + tilecfg cfg; + + if (k > 31) { + for (; m_count > 31; m_count -= 32) { + ptr_b = B; + + ptr_c00 = ptr_c; + ptr_c01 = ptr_c00 + 16; + ptr_c10 = ptr_c + 16 * ldc; + ptr_c11 = ptr_c10 + 16; + ptr_c += 32 * ldc; + n_count = n; + TCONF(cfg, 16, 16, 32); + for (; n_count > 31; n_count -= 32) { + ptr_a0 = ptr_a; + ptr_a1 = ptr_a + 16 * k; + + ptr_b0 = ptr_b; + ptr_b1 = ptr_b + 16 * k; + ptr_b += 32 * k; + + lda = 32; + ldb = 32; + LOAD_C(0, 0); LOAD_C(0, 1); + LOAD_C(1, 0); LOAD_C(1, 1); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); LOAD_A(1, x); + ptr_a0 += 16 * 32; + ptr_a1 += 16 * 32; + LOAD_B(x, 0); LOAD_B(x, 1); + ptr_b0 += 16 * 32; + ptr_b1 += 16 * 32; + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + } + STORE_C(0, 0); STORE_C(0, 1); + STORE_C(1, 0); STORE_C(1, 1); + ptr_c00 += 32; + ptr_c01 += 32; + ptr_c10 += 32; + ptr_c11 += 32; + } + for (; n_count > 0; n_count -= 16) { + int tail_n = (n_count > 16) ? 16: n_count; + ptr_a0 = ptr_a; + ptr_a1 = ptr_a + 16 * k; + + ptr_b0 = ptr_b; + ptr_b += tail_n * k; + + lda = 32; + ldb = 2 * tail_n; + TCONF(cfg, 16, tail_n, 32); + LOAD_C(0, 0); + LOAD_C(1, 0); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); LOAD_A(1, x); + ptr_a0 += 16 * 32; + ptr_a1 += 16 * 32; + LOAD_B(x, 0); + ptr_b0 += tail_n * 32; + + MATMUL(0, 0); + MATMUL(1, 0); + } + STORE_C(0, 0); + STORE_C(1, 0); + ptr_c00 += tail_n; + ptr_c10 += tail_n; + } + ptr_a += 32 * k; + } + for (; m_count > 0; m_count -= 16) { + // process at most 16 m at a time + int tail_m = (m_count > 16) ? 16: m_count; + + ptr_b = B; + + ptr_c00 = ptr_c; + ptr_c01 = ptr_c00 + 16; + ptr_c += tail_m * ldc; + n_count = n; + TCONF(cfg, tail_m, 16, 32); + for (; n_count > 31; n_count -= 32) { + ptr_a0 = ptr_a; + + ptr_b0 = ptr_b; + ptr_b1 = ptr_b + 16 * k; + ptr_b += 32 * k; + + lda = 32; + ldb = 32; + LOAD_C(0, 0); LOAD_C(0, 1); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); + ptr_a0 += tail_m * 32; + LOAD_B(x, 0); LOAD_B(x, 1); + ptr_b0 += 16 * 32; + ptr_b1 += 16 * 32; + + MATMUL(0, 0); MATMUL(0, 1); + } + STORE_C(0, 0); STORE_C(0, 1); + ptr_c00 += 32; + ptr_c01 += 32; + } + for (; n_count > 0; n_count -= 16) { + int tail_n = (n_count > 16) ? 16: n_count; + ptr_a0 = ptr_a; + + ptr_b0 = ptr_b; + ptr_b += tail_n * k; + + lda = 32; + ldb = 2 * tail_n; + TCONF(cfg, tail_m, tail_n, 32); + LOAD_C(0, 0); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); + ptr_a0 += tail_m * 32; + LOAD_B(x, 0); + ptr_b0 += tail_n * 32; + + MATMUL(0, 0); + } + STORE_C(0, 0); + ptr_c00 += tail_n; + } + ptr_a += tail_m * k; + } + } + + // process for k < 32 + BLASLONG k32 = k & ~31; + BLASLONG k2 = k & ~1; + if (k32 != k) { + int remain_k2 = k2 - k32; + m_count = m; + ptr_a = A; +#ifndef ALPHA_ONE + ptr_c = tmp_c; +#else + ptr_c = C; +#endif + if (remain_k2 > 0 && k2 != k) { // k%32 = 2x + 1 (x != 0) + for (; m_count > 0; m_count -= 16) { + int tail_m = (m_count > 16) ? 
16: m_count; + __mmask16 amask = (1UL << tail_m) - 1; + + ptr_a0 = ptr_a + tail_m * k32; + ptr_a1 = ptr_a + tail_m * k2; + ptr_a += tail_m * k; + ptr_b = B; + ptr_c00 = ptr_c; + ptr_c += tail_m * ldc; + n_count = n; + lda = remain_k2; + ldb = 32; + if (n_count > 15) { + TCONF_TAIL(cfg, tail_m, 16, remain_k2); + LOAD_A(0, x); MASK_LOAD_A_TAIL(1, x); + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k32; + ptr_b1 = ptr_b + 16 * k2; + LOAD_C_F(0, 0); + LOAD_B(x, 0); LOAD_B_TAIL(x, 1); + MATMUL(0, 0); MATMUL_TAIL(1, 1); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } + } + if (n_count > 0) { + int tail_n = (n_count > 16) ? 16: n_count; + __mmask16 bmask = (1UL << tail_n) - 1; + ptr_b0 = ptr_b + tail_n * k32; + ptr_b1 = ptr_b + tail_n * k2; + ldb = 2 * tail_n; + TCONF_TAIL(cfg, tail_m, tail_n, remain_k2); + LOAD_C_F(0, 0); + LOAD_A(0, x); MASK_LOAD_A_TAIL(1, x); + LOAD_B(x, 0); MASK_LOAD_B_TAIL(x, 1); + MATMUL(0, 0); MATMUL_TAIL(1, 1); + STORE_C(0, 0); + } + } + + } else if (remain_k2 > 0) { // k%32 = 2x + for (; m_count > 0; m_count -= 16) { + int tail_m = (m_count > 16) ? 16: m_count; + + ptr_a0 = ptr_a + tail_m * k32; + ptr_a += tail_m * k; + ptr_b = B; + ptr_c00 = ptr_c; + ptr_c += tail_m * ldc; + n_count = n; + lda = remain_k2; + ldb = 32; + if (n_count > 15) { + TCONF(cfg, tail_m, 16, remain_k2); + LOAD_A(0, x); + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k32; + LOAD_C_F(0, 0); + LOAD_B(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } + } + if (n_count > 0) { + int tail_n = (n_count > 16) ? 16: n_count; + ptr_b0 = ptr_b + tail_n * k32; + ldb = 2 * tail_n; + TCONF(cfg, tail_m, tail_n, remain_k2); + LOAD_C_F(0, 0); + LOAD_A(0, x); + LOAD_B(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + } + } + } else { // k%32 = 1 + for (; m_count > 0; m_count -= 16) { + int tail_m = (m_count > 16) ? 16: m_count; + __mmask16 amask = (1UL << tail_m) - 1; + + ptr_a0 = ptr_a + tail_m * k2; + ptr_a += tail_m * k; + ptr_b = B; + ptr_c00 = ptr_c; + ptr_c += tail_m * ldc; + n_count = n; + if (n_count > 15) { + TCONF(cfg, tail_m, 16, 2); + MASK_LOAD_A_TAIL(0, x); + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k2; + LOAD_C_F(0, 0); + LOAD_B_TAIL(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } + } + if (n_count > 0) { + int tail_n = (n_count > 16) ? 
16: n_count; + __mmask16 bmask = (1UL << tail_n) - 1; + ptr_b0 = ptr_b + tail_n * k2; + TCONF(cfg, tail_m, tail_n, 2); + LOAD_C_F(0, 0); + MASK_LOAD_A_TAIL(0, x); + MASK_LOAD_B_TAIL(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + } + } + + } + } +#ifndef ALPHA_ONE + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); + BLASLONG n16 = n & ~15; + BLASLONG noffset; + FLOAT *src0, *src1, *src2, *src3; + FLOAT *dst0, *dst1, *dst2, *dst3; + FLOAT *src = tmp_c; + FLOAT *dst = C; + m_count = m; + for (; m_count > 3; m_count -= 4) { + src0 = src; + src1 = src0 + ldc; + src2 = src1 + ldc; + src3 = src2 + ldc; + src += 4 * ldc; + + dst0 = dst; + dst1 = dst0 + ldc_o; + dst2 = dst1 + ldc_o; + dst3 = dst2 + ldc_o; + dst += 4 * ldc_o; + + noffset = 0; + for (; noffset < n16; noffset += 16) { + ALPHA_STORE(0); + ALPHA_STORE(1); + ALPHA_STORE(2); + ALPHA_STORE(3); + } + if (noffset < n) { + __mmask16 mask = (1UL << (n - noffset)) - 1; + MASK_APLPHA_STORE(0); + MASK_APLPHA_STORE(1); + MASK_APLPHA_STORE(2); + MASK_APLPHA_STORE(3); + } + } + for (; m_count > 1; m_count -= 2) { + src0 = src; + src1 = src0 + ldc; + src += 2 * ldc; + + dst0 = dst; + dst1 = dst0 + ldc_o; + dst += 2 * ldc_o; + + noffset = 0; + for (; noffset < n16; noffset += 16) { + ALPHA_STORE(0); + ALPHA_STORE(1); + } + if (noffset < n) { + __mmask16 mask = (1UL << (n - noffset)) - 1; + MASK_APLPHA_STORE(0); + MASK_APLPHA_STORE(1); + } + } + for (; m_count > 0; m_count -= 1) { + src0 = src; + dst0 = dst; + noffset = 0; + for (; noffset < n16; noffset += 16) { + ALPHA_STORE(0); + } + if (noffset < n) { + __mmask16 mask = (1UL << (n - noffset)) - 1; + MASK_APLPHA_STORE(0); + } + } + free(raw_tmp_c); +#endif + return 0; +} diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c new file mode 100644 index 000000000..b94aa3c84 --- /dev/null +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -0,0 +1,499 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include "common.h" + +#define VMOVLDUP(addr, zmm) asm("vmovsldup (%1), %0": "=v"(zmm): "r"(addr)) +#define VMOVHDUP(addr, zmm) asm("vmovshdup (%1), %0": "=v"(zmm): "r"(addr)) +#define BROADCAST64(base, step, n, offset, zmm) \ + if (n == 0) asm("vbroadcastsd %c2(%1), %0": "=v"(zmm): "r"(base), "n"(offset*2)); \ + else asm("vbroadcastsd %c4(%1, %2, %c3), %0": "=v"(zmm): "r"(base), "r"(step), "n"(n*2), "n"(offset*2)) + +#define DECLARE_A_PAIR(A) \ + __m512i A_lo_##A; __m512i A_hi_##A; + +#define LOAD_A_PAIR(A) \ + VMOVLDUP(ptr_a##A, A_lo_##A); \ + VMOVHDUP(ptr_a##A, A_hi_##A); + +#define MASK_LOAD_A_PAIR(A) { \ + __m512 tmp = _mm512_maskz_loadu_ps(mmask, ptr_a##A); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(tmp); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(tmp); \ +} + +#define LOAD_A_PAIR_TAIL(A) { \ + __m256i ymm = _mm256_loadu_si256((void *)ptr_a##A); \ + __m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \ +} + +#define MASK_LOAD_A_PAIR_TAIL(A) { \ + __m256i ymm = _mm256_maskz_loadu_epi16(mmask, ptr_a##A); \ + __m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \ +} + +#define DECLARE_B_PAIR() \ + __m512i B_lo; __m512i B_hi; + +#define PREFETCH_B_STEP 32 +#define PREFETCH_B(Bx, By) \ + if (By == 0) asm("prefetcht0 %c1(%0)": : "r"(ptr_b##Bx), "n"(PREFETCH_B_STEP * 2)); \ + else asm("prefetcht0 %c3(%0, %1, %c2)": : "r"(ptr_b##Bx), "r"(n_blksize), "n"(By*2), "n"(PREFETCH_B_STEP * 2)) + +#define BROADCAST_B_PAIR(Bx, By) \ + BROADCAST64(ptr_b##Bx, n_blksize, By, 0, B_lo); \ + BROADCAST64(ptr_b##Bx, n_blksize, By, 4, B_hi); + +#define MASK_BROADCAST_B_PAIR(Bx, x) {\ + __m128 xmm = _mm_maskz_loadu_ps(nmask, ptr_b##Bx); \ + B_lo = (__m512i) _mm512_broadcastsd_pd((__m128d) xmm); \ + B_hi = (__m512i) _mm512_broadcastsd_pd(_mm_permute_pd((__m128d) xmm, 0x1)); \ +} + +#define BROADCAST_B_PAIR_TAIL(Bx, By) {\ + __m128i xmm = (__m128i) _mm_load_sd((double *)(ptr_b##Bx + n_blksize * By)); \ + xmm = _mm_cvtepu16_epi32(xmm); \ + B_lo = _mm512_broadcast_i32x2(xmm); \ + B_hi = _mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ +} + +#define MASK_BROADCAST_B_PAIR_TAIL(Bx, By) {\ + __m128i xmm = _mm_maskz_loadu_epi16(nmask, ptr_b##Bx + n_blksize * By); \ + xmm = _mm_cvtepu16_epi32(xmm); \ + B_lo = _mm512_broadcast_i32x2(xmm); \ + B_hi = _mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ +} + +#define DECLARE_RESULT_4X(A, Bx, By) \ + __m512 result_00_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_01_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_10_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_11_##A##Bx##By = _mm512_setzero_ps(); + +#define FMA(a, b, r) r = _mm512_dpbf16_ps(r, (__m512bh)a, (__m512bh)b) + +#define MATMUL_4X(A, Bx, By) \ + FMA(A_lo_##A, B_lo, result_00_##A##Bx##By); \ + FMA(A_hi_##A, B_lo, result_01_##A##Bx##By); \ + FMA(A_lo_##A, B_hi, result_10_##A##Bx##By); \ + FMA(A_hi_##A, B_hi, result_11_##A##Bx##By); + +#define _STORE_C_2nx16(addr, val0, val1) \ + asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \ + asm("vfmadd213ps (%1, %3, 4), %2, %0": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc)); \ + asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); \ + asm("vmovups %0, (%1, %2, 4)": : "v"(val1), "r"(addr), "r"(ldc)) + +#define 
_MASK_STORE_C_2nx16(addr, val0, val1) \ + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "Yk"(mmask)); \ + asm("vfmadd213ps (%1, %3, 4), %2, %0 %{%4%}": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc), "Yk"(mmask)); \ + asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "Yk"(mmask)); \ + asm("vmovups %0, (%1, %2, 4) %{%3%}": : "v"(val1), "r"(addr), "r"(ldc), "Yk"(mmask)) + +#define _REORDER_C_2X(result_0, result_1) { \ + __m512 tmp0, tmp1; \ + tmp0 = _mm512_unpacklo_ps(result_0, result_1); \ + tmp1 = _mm512_unpackhi_ps(result_0, result_1); \ + result_0 = (__m512) _mm512_unpacklo_pd((__m512d) tmp0, (__m512d) tmp1); \ + result_1 = (__m512) _mm512_unpackhi_pd((__m512d) tmp0, (__m512d) tmp1); \ +} + +#define _STORE_2X(ptr_c, result_0, result_1) {\ + _REORDER_C_2X(result_0, result_1) \ + _STORE_C_2nx16(ptr_c, result_0, result_1); \ + ptr_c += ldc * 2; \ +} + +#define _MASK_STORE_2X(ptr_c, result_0, result_1) {\ + _REORDER_C_2X(result_0, result_1) \ + _MASK_STORE_C_2nx16(ptr_c, result_0, result_1); \ + ptr_c += ldc * 2; \ +} + +#define STORE_4X(A, Bx, By) { \ + _STORE_2X(ptr_c##A, result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \ +} + +#define MASK_STORE_4X(A, Bx, By) { \ + _MASK_STORE_2X(ptr_c##A, result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _MASK_STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \ +} + +#define _STORE_C_16(addr, val0) \ + asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \ + asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); + +#define _MASK_STORE_C_16(addr, val0) \ + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "Yk"(mmask)); \ + asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "Yk"(mmask)); + +#define N_STORE_4X(A, Bx, By) { \ + _REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \ + switch(n_count) { \ + case 3: _STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \ + case 2: _STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \ + case 1: _STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \ + } \ + ptr_c##A += ldc * n_count; \ +} + +#define N_MASK_STORE_4X(A, Bx, By) { \ + _REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \ + switch(n_count) { \ + case 3: _MASK_STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \ + case 2: _MASK_STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \ + case 1: _MASK_STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \ + } \ + ptr_c##A += ldc * n_count; \ +} + + +int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +{ + IFLOAT *ptr_a = A, *ptr_b = B; + IFLOAT *ptr_b0, *ptr_b1; + IFLOAT *ptr_a0, *ptr_a1; + FLOAT *ptr_c = C; + FLOAT *ptr_c0, *ptr_c1; + BLASLONG n_count = n; + BLASLONG m_count, k_count; + BLASLONG n_blksize = 4 * k; + BLASLONG cn_offset = 0; + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); + + for (; n_count > 23; n_count -= 24) { + IFLOAT *ptr_b00 = ptr_b; + IFLOAT *ptr_b10 = ptr_b + n_blksize * 3; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + ptr_b1 = ptr_b10; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 
1); DECLARE_RESULT_4X(0, 1, 2); + k_count = k; + for (; k_count > 3; k_count -=4) { + LOAD_A_PAIR(0); + _mm_prefetch(ptr_a0 + 128, _MM_HINT_T0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); PREFETCH_B(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); PREFETCH_B(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); PREFETCH_B(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); PREFETCH_B(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); PREFETCH_B(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); PREFETCH_B(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + + LOAD_A_PAIR(0); + _mm_prefetch(ptr_a0 + 128, _MM_HINT_T0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + } + for (; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + ptr_a0 += 16; + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4; + } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); + STORE_4X(0, 1, 0); STORE_4X(0, 1, 1); STORE_4X(0, 1, 2); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + ptr_b1 = ptr_b10; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + ptr_a0 += m_count * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + ptr_a0 += m_count; + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); + MASK_STORE_4X(0, 1, 0); MASK_STORE_4X(0, 1, 1); MASK_STORE_4X(0, 1, 2); + ptr_c += m_count; + } + ptr_b += 24 * k; + cn_offset += 24; + } + for (; n_count > 11; n_count -= 12) { + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_a1 = ptr_a + 16 * k; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 31; m_count -= 32) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); 
DECLARE_A_PAIR(1); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(1, 0, 0); DECLARE_RESULT_4X(1, 0, 1); DECLARE_RESULT_4X(1, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); LOAD_A_PAIR(1); + ptr_a0 += 16 * 2; + ptr_a1 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); + ptr_b0 += 4 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); LOAD_A_PAIR_TAIL(1); + ptr_a0 += 16; + ptr_a1 += 16; + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); + ptr_b0 += 4; + } + ptr_c0 = ptr_c; + ptr_c1 = ptr_c + 16; + STORE_4X(0, 0, 0); STORE_4X(1, 0, 0); + STORE_4X(0, 0, 1); STORE_4X(1, 0, 1); + STORE_4X(0, 0, 2); STORE_4X(1, 0, 2); + ptr_c += 16 * 2; + ptr_a0 = ptr_a1; + ptr_a1 = ptr_a0 + 16 * k; + } + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + ptr_a0 += 16; + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + ptr_a0 += m_count * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + ptr_a0 += m_count; + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); + ptr_c += m_count; + } + ptr_b += 12 * k; + cn_offset += 12; + } + for (; n_count > 3; n_count -= 4) { + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4 * 2; + ptr_a0 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4; + ptr_a0 += 16; + } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + 
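+                // Main k loop: each iteration consumes two k values (one interleaved bf16 pair per 32-bit lane) through the _mm512_dpbf16_ps-based MATMUL_4X; an odd trailing k value is handled by the *_TAIL path right after this loop.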
MASK_LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4 * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); + ptr_c += m_count; + } + ptr_b += 4 * k; + cn_offset += 4; + } + if (n_count > 0) { + __mmask8 nmask = (1UL << n_count) - 1; + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count * 2; + ptr_a0 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count; + ptr_a0 += 16; + } + ptr_c0 = ptr_c; + N_STORE_4X(0, 0, 0); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + N_MASK_STORE_4X(0, 0, 0); + ptr_c += m_count; + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_microk_cooperlake_template.c b/kernel/x86_64/sbgemm_microk_cooperlake_template.c new file mode 100644 index 000000000..bd5cbb744 --- /dev/null +++ b/kernel/x86_64/sbgemm_microk_cooperlake_template.c @@ -0,0 +1,1835 @@ +#include "bf16_common_macros.h" +#include + +#define BF16_BLOCK_STEP_N 8 +#define BF16_BLOCK_THRES_K 1024 +#define BF16_BLOCK_THRES_M 32 +#define BF16_BLOCK_THRES_N 1024 + +#define A(i,j) A[(i)*lda+(j)] +#define B(i,j) B[(i)*ldb+(j)] +#define C(i,j) C[(i)*ldc+(j)] + +#define ONE 1.e0f +#define ZERO 0.e0f + +#define SHUFFLE_MAGIC_NO (const int) 0x39 + +#undef STORE16_COMPLETE_RESULT +#undef STORE16_MASK_COMPLETE_RESULT +#undef SBGEMM_BLOCK_KERNEL_NN_32x8xK +#undef SBGEMM_BLOCK_KERNEL_NN_16x8xK +#undef SBGEMM_BLOCK_KERNEL_NN_32xNx32 +#undef SBGEMM_BLOCK_KERNEL_NN_16xNx32 +#undef SBGEMM_BLOCK_KERNEL_NT_32x8xK +#undef SBGEMM_BLOCK_KERNEL_NT_16x8xK +#undef SBGEMM_BLOCK_KERNEL_NT_32xNxK +#undef SBGEMM_BLOCK_KERNEL_NT_16xNxK +#undef SBGEMM_BLOCK_KERNEL_TN_32x8xK +#undef SBGEMM_BLOCK_KERNEL_TN_16x8xK +#undef SBGEMM_BLOCK_KERNEL_TN_32xNx32 +#undef SBGEMM_BLOCK_KERNEL_TN_16xNx32 +#undef SBGEMM_BLOCK_KERNEL_TT_32x8xK +#undef SBGEMM_BLOCK_KERNEL_TT_16x8xK +#undef SBGEMM_BLOCK_KERNEL_TT_32xNxK +#undef SBGEMM_BLOCK_KERNEL_TT_16xNxK +#undef SBGEMM_BLOCKING_KERNEL_NN +#undef SBGEMM_BLOCKING_KERNEL_NT +#undef SBGEMM_BLOCKING_KERNEL_TN +#undef SBGEMM_BLOCKING_KERNEL_TT + +#ifndef ONE_ALPHA // ALPHA is not ONE + #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE + #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE + + #define SBGEMM_BLOCK_KERNEL_NN_32x8xK sbgemm_block_kernel_nn_32x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_NN_16x8xK sbgemm_block_kernel_nn_16x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_NN_32xNx32 sbgemm_block_kernel_nn_32xNx32_alpha + #define SBGEMM_BLOCK_KERNEL_NN_16xNx32 sbgemm_block_kernel_nn_16xNx32_alpha + + #define SBGEMM_BLOCK_KERNEL_NT_32x8xK 
SBGEMM_BLOCK_KERNEL_NN_32x8xK + #define SBGEMM_BLOCK_KERNEL_NT_16x8xK SBGEMM_BLOCK_KERNEL_NN_16x8xK + #define SBGEMM_BLOCK_KERNEL_NT_32xNxK sbgemm_block_kernel_nt_32xNxK_alpha + #define SBGEMM_BLOCK_KERNEL_NT_16xNxK sbgemm_block_kernel_nt_16xNxK_alpha + + #define SBGEMM_BLOCK_KERNEL_TN_32x8xK sbgemm_block_kernel_tn_32x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_TN_16x8xK sbgemm_block_kernel_tn_16x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_TN_32xNx32 sbgemm_block_kernel_tn_32xNx32_alpha + #define SBGEMM_BLOCK_KERNEL_TN_16xNx32 sbgemm_block_kernel_tn_16xNx32_alpha + + #define SBGEMM_BLOCK_KERNEL_TT_32x8xK SBGEMM_BLOCK_KERNEL_TN_32x8xK + #define SBGEMM_BLOCK_KERNEL_TT_16x8xK SBGEMM_BLOCK_KERNEL_TN_16x8xK + #define SBGEMM_BLOCK_KERNEL_TT_32xNxK sbgemm_block_kernel_tt_32xNxK_alpha + #define SBGEMM_BLOCK_KERNEL_TT_16xNxK sbgemm_block_kernel_tt_16xNxK_alpha + + #define SBGEMM_BLOCKING_KERNEL_NN sbgemm_blocking_kernel_nn_alpha + #define SBGEMM_BLOCKING_KERNEL_NT sbgemm_blocking_kernel_nt_alpha + #define SBGEMM_BLOCKING_KERNEL_TN sbgemm_blocking_kernel_tn_alpha + #define SBGEMM_BLOCKING_KERNEL_TT sbgemm_blocking_kernel_tt_alpha +#else // ALPHA is ONE + #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ONE_ONE + #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ONE_ONE + + #define SBGEMM_BLOCK_KERNEL_NN_32x8xK sbgemm_block_kernel_nn_32x8xK_one + #define SBGEMM_BLOCK_KERNEL_NN_16x8xK sbgemm_block_kernel_nn_16x8xK_one + #define SBGEMM_BLOCK_KERNEL_NN_32xNx32 sbgemm_block_kernel_nn_32xNx32_one + #define SBGEMM_BLOCK_KERNEL_NN_16xNx32 sbgemm_block_kernel_nn_16xNx32_one + + #define SBGEMM_BLOCK_KERNEL_NT_32x8xK SBGEMM_BLOCK_KERNEL_NN_32x8xK + #define SBGEMM_BLOCK_KERNEL_NT_16x8xK SBGEMM_BLOCK_KERNEL_NN_16x8xK + #define SBGEMM_BLOCK_KERNEL_NT_32xNxK sbgemm_block_kernel_nt_32xNxK_one + #define SBGEMM_BLOCK_KERNEL_NT_16xNxK sbgemm_block_kernel_nt_16xNxK_one + + #define SBGEMM_BLOCK_KERNEL_TN_32x8xK sbgemm_block_kernel_tn_32x8xK_one + #define SBGEMM_BLOCK_KERNEL_TN_16x8xK sbgemm_block_kernel_tn_16x8xK_one + #define SBGEMM_BLOCK_KERNEL_TN_32xNx32 sbgemm_block_kernel_tn_32xNx32_one + #define SBGEMM_BLOCK_KERNEL_TN_16xNx32 sbgemm_block_kernel_tn_16xNx32_one + + #define SBGEMM_BLOCK_KERNEL_TT_32x8xK SBGEMM_BLOCK_KERNEL_TN_32x8xK + #define SBGEMM_BLOCK_KERNEL_TT_16x8xK SBGEMM_BLOCK_KERNEL_TN_16x8xK + #define SBGEMM_BLOCK_KERNEL_TT_32xNxK sbgemm_block_kernel_tt_32xNxK_one + #define SBGEMM_BLOCK_KERNEL_TT_16xNxK sbgemm_block_kernel_tt_16xNxK_one + + #define SBGEMM_BLOCKING_KERNEL_NN sbgemm_blocking_kernel_nn_one + #define SBGEMM_BLOCKING_KERNEL_NT sbgemm_blocking_kernel_nt_one + #define SBGEMM_BLOCKING_KERNEL_TN sbgemm_blocking_kernel_tn_one + #define SBGEMM_BLOCKING_KERNEL_TT sbgemm_blocking_kernel_tt_one +#endif + +extern bfloat16 * block_A; +extern bfloat16 * block_B; + +/* --------------------------------------------- NN kernels ------------------------------------------ */ +// SBGEMM Kernel for 16> (32-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base0, result_512_8); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base1, result_512_8); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base0, result_512_9); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base1, result_512_9); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_2, 
(C_addr + ldc*1)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (C_addr + ldc*1 + 16), tail_mask) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base0, result_512_10); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base1, result_512_10); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base0, result_512_11); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base1, result_512_11); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*2)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*2 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (C_addr + ldc*3)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (C_addr + ldc*3 + 16), tail_mask) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base0, result_512_12); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base1, result_512_12); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base0, result_512_13); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base1, result_512_13); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*4)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*4 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (C_addr + ldc*5)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (C_addr + ldc*5 + 16), tail_mask) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base0, result_512_14); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base1, result_512_14); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base0, result_512_15); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base1, result_512_15); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*6)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*6 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (C_addr + ldc*7)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (C_addr + ldc*7 + 16), tail_mask) + } else { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base0, result_512_8); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base1, result_512_8); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base0, result_512_9); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base1, result_512_9); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + 16)) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (C_addr + ldc*1)) + STORE16_COMPLETE_RESULT(result_512_tmp_3, (C_addr + ldc*1 + 16)) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base0, result_512_10); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base1, result_512_10); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base0, result_512_11); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base1, result_512_11); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*2)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*2 + 16)) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (C_addr + ldc*3)) + STORE16_COMPLETE_RESULT(result_512_tmp_3, (C_addr + ldc*3 + 16)) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base0, result_512_12); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base1, result_512_12); + result_512_tmp_2 = 
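/*
 * The stores above combine pairs of accumulators with _mm512_permutex2var_ps:
 * each column of the 32x8 C block is held in two 16-float registers
 * (result_512_i for rows 0..15 of that column, result_512_{i+8} for rows
 * 16..31, judging by the C_addr and C_addr+16 store targets), and the permute
 * picks lanes from both according to the shuffle_idx_base0/base1 tables defined
 * earlier in this file. A scalar model of the intrinsic itself:
 */
static void permutex2var_ps_model(const float a[16], const int idx[16],
                                  const float b[16], float out[16])
{
    for (int i = 0; i < 16; i++) {
        int sel = idx[i] & 0x1f;                    /* only the low 5 bits matter  */
        out[i] = (sel < 16) ? a[sel] : b[sel - 16]; /* bit 4 selects the b operand */
    }
}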
_mm512_permutex2var_ps(result_512_5, shuffle_idx_base0, result_512_13); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base1, result_512_13); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*4)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*4 + 16)) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (C_addr + ldc*5)) + STORE16_COMPLETE_RESULT(result_512_tmp_3, (C_addr + ldc*5 + 16)) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base0, result_512_14); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base1, result_512_14); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base0, result_512_15); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base1, result_512_15); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*6)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*6 + 16)) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (C_addr + ldc*7)) + STORE16_COMPLETE_RESULT(result_512_tmp_3, (C_addr + ldc*7 + 16)) + } +} + +// SBGEMM Kernel for M<=16, N=8, K can be any number +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_nn_16x8xK_alpha(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_nn_16x8xK_one(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512_0, arrayB_512_1, arrayB_512_2, arrayB_512_3, arrayB_512_4, arrayB_512_5, arrayB_512_6, arrayB_512_7; + __m512 result_512_0, result_512_1, result_512_2, result_512_3, result_512_4, result_512_5, result_512_6, result_512_7; + + result_512_0 = _mm512_setzero_ps(); + result_512_1 = _mm512_setzero_ps(); + result_512_2 = _mm512_setzero_ps(); + result_512_3 = _mm512_setzero_ps(); + result_512_4 = _mm512_setzero_ps(); + result_512_5 = _mm512_setzero_ps(); + result_512_6 = _mm512_setzero_ps(); + result_512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; idx_k < k; idx_k += 2) { + // Each two rows are a group for 32-pair bf16 elements + // Load two rows into a 512 register + arrayA_512_0 = _mm512_loadu_si512(A_addr); + + _MM512_BROADCASTD_EPI32(B_addr + 0, arrayB_512_0); + _MM512_BROADCASTD_EPI32(B_addr + 2, arrayB_512_1); + _MM512_BROADCASTD_EPI32(B_addr + 4, arrayB_512_2); + _MM512_BROADCASTD_EPI32(B_addr + 6, arrayB_512_3); + _MM512_BROADCASTD_EPI32(B_addr + 8, arrayB_512_4); + _MM512_BROADCASTD_EPI32(B_addr + 10, arrayB_512_5); + _MM512_BROADCASTD_EPI32(B_addr + 12, arrayB_512_6); + _MM512_BROADCASTD_EPI32(B_addr + 14, arrayB_512_7); + + result_512_0 = _mm512_dpbf16_ps(result_512_0, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_0); + result_512_1 = _mm512_dpbf16_ps(result_512_1, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_1); + result_512_2 = _mm512_dpbf16_ps(result_512_2, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_2); + result_512_3 = _mm512_dpbf16_ps(result_512_3, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_3); + result_512_4 = _mm512_dpbf16_ps(result_512_4, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_4); + result_512_5 = _mm512_dpbf16_ps(result_512_5, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_5); + result_512_6 = _mm512_dpbf16_ps(result_512_6, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_6); + result_512_7 = _mm512_dpbf16_ps(result_512_7, (__m512bh) arrayA_512_0, 
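/*
 * All of these kernels reduce to _mm512_dpbf16_ps: every 32-bit lane of the two
 * bf16 operands holds a (low, high) pair, and the instruction accumulates
 * low*low' + high*high' into the corresponding float lane of the accumulator.
 * A scalar model of one lane, ignoring rounding/subnormal details of the
 * hardware instruction:
 */
#include <stdint.h>
#include <string.h>

static float bf16_to_fp32(uint16_t x)
{
    uint32_t u = (uint32_t)x << 16;    /* bfloat16 is the upper half of a float32 */
    float f;
    memcpy(&f, &u, sizeof(f));
    return f;
}

static float dpbf16_lane_model(float acc, uint16_t a_lo, uint16_t a_hi,
                               uint16_t b_lo, uint16_t b_hi)
{
    return acc + bf16_to_fp32(a_lo) * bf16_to_fp32(b_lo)
               + bf16_to_fp32(a_hi) * bf16_to_fp32(b_hi);
}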
(__m512bh) arrayB_512_7); + + // Load B with unroll 8 + B_addr += 16; + // Load A with unroll 16 + A_addr += 32; + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + + result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8); + result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); + result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); + result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); + STORE16_MASK_COMPLETE_RESULT(result_512_0, (C_addr), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_1, (C_addr + ldc*1), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3), tail_mask) + result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); + result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); + result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); + result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8); + STORE16_MASK_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7), tail_mask) + } else { + result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8); + result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); + result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); + result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc*1)) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) + result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); + result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); + result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); + result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8); + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) + } +} + +// SBGEMM Kernel for 16> (32-m)); + for (int i = 0; i < n; i++) { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16)) + } + } +} + +// SBGEMM Kernel for 16<=M, N<8, K can be any number, but the processing will take 32 as a base +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_nn_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void 
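/*
 * The 0xd8 immediate used with _mm512_shuffle_f32x4 above reorders the four
 * 128-bit lanes of a register as (0, 2, 1, 3), i.e. it swaps the two middle
 * lanes while leaving both sources identical, presumably to undo the row
 * interleaving introduced by the Kx16 copy kernel before the 16-wide store.
 * A scalar model of that permutation:
 */
static void shuffle_f32x4_0xd8_model(const float in[16], float out[16])
{
    static const int lane[4] = { 0, 2, 1, 3 };   /* decoded from 0xd8 = 0b11011000 */
    for (int l = 0; l < 4; l++)
        for (int i = 0; i < 4; i++)
            out[l * 4 + i] = in[lane[l] * 4 + i];
}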
sbgemm_block_kernel_nn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + + BLASLONG tag_k_32x = k & (~31); + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + for (int i = 0; i < 8; i += 2) { + result_512[i] = _mm512_setzero_ps(); + result_512[i+1] = _mm512_setzero_ps(); + } + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + // Load B with unroll n + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + + for (BLASLONG idx = 0; idx < 32;) { + // Each two rows are a group for 32-pair bf16 elements + // Load two rows into a 512 register + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i ++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } + } + } + + if (tag_k_32x != k) { + // Load B with unroll n + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + + BLASLONG width = k - tag_k_32x; + for (BLASLONG idx = 0; idx < width;) { + // Each two rows are a group for 32-pair bf16 elements + // Load two rows into a 512 register + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } + } + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + } + } +} + + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_nn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_nn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? 
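/*
 * For orientation, a plain scalar reference of what one NN block kernel call
 * computes on its packed panels: an m x k panel of A times a k x n panel of B,
 * accumulated into C with alpha applied at store time. This sketch assumes
 * simple column-major panels and accumulate-into-C stores; the real kernels
 * consume the pair-interleaved bf16 layouts produced by the copy kernels, and
 * the *_one variants skip the alpha multiply.
 */
static void sbgemm_block_ref(int m, int n, int k, float alpha,
                             const float *A, const float *B, float *C, int ldc)
{
    for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++) {
            float sum = 0.0f;
            for (int p = 0; p < k; p++)
                sum += A[i + p * m] * B[p + j * k];
            C[i + j * ldc] += alpha * sum;
        }
}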
N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, idx_m), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... 
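/*
 * k_step_round32 is simply k_step rounded up to the next multiple of 32, so the
 * per-column stride inside block_B stays a whole number of 32-element bf16
 * groups regardless of the actual K tail:
 */
static long round_up_32(long k_step)
{
    long r = k_step & ~31L;               /* round down to a multiple of 32 */
    return (k_step > r) ? r + 32 : r;     /* bump up when a tail remains    */
}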
+ COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
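/*
 * Skeleton of the N-direction walk that every blocking kernel repeats: a window
 * [n_from, n_to) of at most BF16_BLOCK_THRES_N columns slides across N, and
 * tag_n_Nx marks the largest multiple of BF16_BLOCK_STEP_N inside the window so
 * that full 8-column panels and the ragged tail go to different kernels.
 * THRES_N and STEP_N below are placeholders for the real thresholds.
 */
enum { THRES_N = 256, STEP_N = 8 };

static void walk_n(long N)
{
    long n_from = 0;
    long n_to   = (THRES_N > N) ? N : THRES_N;
    while (n_from < N) {
        long tag_n_Nx = n_to & ~(long)(STEP_N - 1);
        (void)tag_n_Nx;  /* full panels cover [n_from, tag_n_Nx), the tail covers [tag_n_Nx, n_to) */
        n_from = n_to;
        n_to   = (n_to + THRES_N > N) ? N : (n_to + THRES_N);
    }
}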
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } + } +} +/* ----------------------------------------- End of NN kernels --------------------------------------- */ + +/* --------------------------------------------- NT kernels ------------------------------------------ */ +// SBGEMM Kernel for 16> (32-m)); + for (int i = 0; i < n; i ++) { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16), tail_mask) + } + } else { + for (int i = 0; i < n; i ++) { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16)) + } + } +} + +// SBGEMM Kernel for M<=16, N<8, K can be any number +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_nt_16xNxK_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_nt_16xNxK_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + result_512[0] = _mm512_setzero_ps(); + result_512[1] = _mm512_setzero_ps(); + result_512[2] = _mm512_setzero_ps(); + result_512[3] = _mm512_setzero_ps(); + result_512[4] = _mm512_setzero_ps(); + result_512[5] = _mm512_setzero_ps(); + result_512[6] = _mm512_setzero_ps(); + result_512[7] = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; idx_k < k; idx_k += 2) { + // Each two rows are a group for 16-pair bf16 elements + // Load two rows into a 512 register + arrayA_512_0 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i ++) { + _MM512_BROADCASTD_EPI32(B_addr + i*2, arrayB_512[i]); + } + B_addr += 16; + + for (int i = 0; i < n; i ++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512_0, (__m512bh) arrayB_512[i]); + } + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + } + } +} + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_nt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_nt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG 
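/*
 * The tail handling in all of these kernels uses the same mask idiom: shifting
 * an all-ones constant right by (16 - m) leaves exactly the low m bits set, and
 * that value becomes the write mask for the last, partially filled rows.
 */
#include <stdint.h>

static uint16_t tail_mask16(int m)          /* 1 <= m <= 16 */
{
    return (uint16_t)(0xffffu >> (16 - m)); /* e.g. m = 5 -> 0x001f */
}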
tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, idx_m), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
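/*
 * The NT blocking loop differs from the NN one almost only in how B is packed:
 * the NN path calls COL_MAJOR_ONCOPY_KERNEL_* on &B(idx_n, idx_k), the NT path
 * calls COL_MAJOR_OTCOPY_KERNEL_* on &B(idx_k, idx_n), so the transpose of B is
 * absorbed by the copy kernel and the same dpbf16 micro-kernels are reused.
 * A plain-float sketch of the two copy orientations; the real kernels emit a
 * pair-interleaved bf16 layout, so only the source indexing is meaningful here.
 */
static void pack_panel_n(int k, int ncols, const float *B, int ldb, float *dst)
{
    for (int j = 0; j < ncols; j++)         /* walk B column-wise (no transpose) */
        for (int p = 0; p < k; p++)
            dst[p + j * k] = B[p + j * ldb];
}

static void pack_panel_t(int k, int ncols, const float *B, int ldb, float *dst)
{
    for (int j = 0; j < ncols; j++)         /* walk B row-wise (transpose on the fly) */
        for (int p = 0; p < k; p++)
            dst[p + j * k] = B[j + p * ldb];
}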
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
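/*
 * Skeleton of the shared K-direction loop: the first block is min(K, THRES_K)
 * and every later block is whatever remains, capped at THRES_K again, so idx_k
 * lands exactly on K without a separate remainder pass. THRES_K is a
 * placeholder for BF16_BLOCK_THRES_K.
 */
enum { THRES_K = 1024 };

static void walk_k(long K)
{
    long k_step = (K > THRES_K) ? THRES_K : K;
    for (long idx_k = 0; idx_k < K; ) {
        /* ...pack a k_step-deep panel and run the micro-kernels... */
        idx_k += k_step;
        k_step = K - idx_k;
        if (k_step > THRES_K) k_step = THRES_K;
    }
}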
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } + } +} +/* ----------------------------------------- End of NT kernels --------------------------------------- */ + +/* --------------------------------------------- TN kernels ------------------------------------------ */ +// SBGEMM Kernel for 16> (32-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_MASK_COMPLETE_RESULT(result_512_8, (C_addr + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc)) + STORE16_MASK_COMPLETE_RESULT(result_512_9, (C_addr + ldc + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_MASK_COMPLETE_RESULT(result_512_10, (C_addr + ldc*2 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) + STORE16_MASK_COMPLETE_RESULT(result_512_11, (C_addr + ldc*3 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_MASK_COMPLETE_RESULT(result_512_12, (C_addr + ldc*4 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_MASK_COMPLETE_RESULT(result_512_13, (C_addr + ldc*5 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_MASK_COMPLETE_RESULT(result_512_14, (C_addr + ldc*6 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) + STORE16_MASK_COMPLETE_RESULT(result_512_15, (C_addr + ldc*7 + 16), tail_mask) + } else { + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_COMPLETE_RESULT(result_512_8, (C_addr + 16)) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc)) + STORE16_COMPLETE_RESULT(result_512_9, (C_addr + ldc + 16)) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_COMPLETE_RESULT(result_512_10, (C_addr + ldc*2 + 16)) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) + STORE16_COMPLETE_RESULT(result_512_11, (C_addr + ldc*3 + 16)) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_COMPLETE_RESULT(result_512_12, (C_addr + ldc*4 + 16)) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_COMPLETE_RESULT(result_512_13, (C_addr + ldc*5 + 16)) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_COMPLETE_RESULT(result_512_14, (C_addr + ldc*6 + 16)) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) + STORE16_COMPLETE_RESULT(result_512_15, (C_addr + ldc*7 + 16)) + } +} + +// SBGEMM Kernel for M=16, N=8, K=Any number +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_tn_16x8xK_alpha(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_tn_16x8xK_one(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512_0, arrayB_512_1, arrayB_512_2, arrayB_512_3, arrayB_512_4, arrayB_512_5, arrayB_512_6, arrayB_512_7; + __m512 result_512_0, result_512_1, result_512_2, result_512_3, result_512_4, result_512_5, result_512_6, result_512_7; + + result_512_0 = _mm512_setzero_ps(); + result_512_1 = _mm512_setzero_ps(); + result_512_2 = _mm512_setzero_ps(); + result_512_3 = _mm512_setzero_ps(); + result_512_4 = _mm512_setzero_ps(); + result_512_5 = _mm512_setzero_ps(); + result_512_6 = _mm512_setzero_ps(); + result_512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; 
idx_k < k; idx_k += 2) { + // Load 16 pair of BF16 elements from A (16 rows) + arrayA_512_0 = _mm512_loadu_si512(A_addr + 0); + + // Load 8 rows of B + _MM512_BROADCASTD_EPI32(B_addr + 0, arrayB_512_0); + _MM512_BROADCASTD_EPI32(B_addr + 2, arrayB_512_1); + _MM512_BROADCASTD_EPI32(B_addr + 4, arrayB_512_2); + _MM512_BROADCASTD_EPI32(B_addr + 6, arrayB_512_3); + _MM512_BROADCASTD_EPI32(B_addr + 8, arrayB_512_4); + _MM512_BROADCASTD_EPI32(B_addr + 10, arrayB_512_5); + _MM512_BROADCASTD_EPI32(B_addr + 12, arrayB_512_6); + _MM512_BROADCASTD_EPI32(B_addr + 14, arrayB_512_7); + + result_512_0 = _mm512_dpbf16_ps(result_512_0, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_0); + result_512_1 = _mm512_dpbf16_ps(result_512_1, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_1); + result_512_2 = _mm512_dpbf16_ps(result_512_2, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_2); + result_512_3 = _mm512_dpbf16_ps(result_512_3, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_3); + result_512_4 = _mm512_dpbf16_ps(result_512_4, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_4); + result_512_5 = _mm512_dpbf16_ps(result_512_5, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_5); + result_512_6 = _mm512_dpbf16_ps(result_512_6, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_6); + result_512_7 = _mm512_dpbf16_ps(result_512_7, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_7); + + // Load B with unroll 8 + B_addr += 16; + // Load A with unroll 32 + A_addr += 32; + } + + if (m != 16) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + STORE16_MASK_COMPLETE_RESULT(result_512_0, (C_addr), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_1, (C_addr + ldc), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7), tail_mask) + } else { + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc)) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) + } +} + +// SBGEMM Kernel for 16> (32-m)); + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16)) + } + } +} + +// SBGEMM Kernel for M<=16, N<8, K=Any number but will be processed based on 32 +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_tn_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_tn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr 
= C; + + BLASLONG tag_k_32x = k & (~31); + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + for (int i = 0; i < 8; i++) { + result_512[i] = _mm512_setzero_ps(); + } + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + // Load B with unroll n + for (int i = 0; i < n; i ++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + + for (BLASLONG idx = 0; idx < 32;) { + // Each two rows are a group for 32-pair bf16 elements + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } + } + } + + if (tag_k_32x != k) { + // Load B with unroll n + for (int i = 0; i < n; i ++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + + BLASLONG width = k - tag_k_32x; + for (BLASLONG idx = 0; idx < width;) { + // Each two rows are a group for 32-pair bf16 elements + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } + } + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + for (int i = 0; i < n; i++) { + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + } + } +} + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_tn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_tn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... 
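/*
 * The Nx32 kernels pre-load 32 consecutive bf16 of each B column into one
 * register and peel it apart two elements at a time:
 * _mm512_broadcastd_epi32(_mm512_castsi512_si128(v)) broadcasts the lowest
 * 32-bit pair to all lanes, _mm512_shuffle_epi32 with the SHUFFLE_MAGIC_NO
 * pattern (defined earlier in the file) rotates the pairs inside each 128-bit
 * lane, and once four pairs have been consumed _mm512_shuffle_i32x4 rotates
 * whole 128-bit lanes. The cryptic trigger for that lane rotation is just a
 * multiple-of-eight test on the k index:
 */
static int needs_lane_rotate(long idx)
{
    return (idx & ~7L) == idx;    /* equivalent to (idx % 8) == 0 */
}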
+ COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); // TODO how to process m + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(idx_m, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
+ COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } + } +} +/* ----------------------------------------- End of TN kernels --------------------------------------- */ + +/* --------------------------------------------- TT kernels ------------------------------------------ */ +// SBGEMM Kernel for 16> (32-m)); + for (int i = 0; i < n; i ++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16), tail_mask) + } + } else { + for (int i = 0; i < n; i ++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16)) + } + } +} + +// SBGEMM Kernel for M<=16, N<8, K can be any number +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_tt_16xNxK_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_tt_16xNxK_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + result_512[0] = _mm512_setzero_ps(); + result_512[1] = _mm512_setzero_ps(); + result_512[2] = _mm512_setzero_ps(); + result_512[3] = _mm512_setzero_ps(); + result_512[4] = _mm512_setzero_ps(); + result_512[5] = _mm512_setzero_ps(); + result_512[6] = _mm512_setzero_ps(); + result_512[7] = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; idx_k < k; idx_k += 2) { + // Each two rows are a group for 16-pair bf16 elements + // Load two rows into a 512 register + arrayA_512_0 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i ++) { + _MM512_BROADCASTD_EPI32(B_addr + i*2, arrayB_512[i]); + } + B_addr += 16; + + for (int i = 0; i < n; i ++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512_0, (__m512bh) arrayB_512[i]); + } + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + for (int i = 0; i < n; i++) { + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + } + } +} + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_tt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_tt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... 
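/*
 * Plain scalar reference for the TT path, with both operands transposed. The
 * packing kernels (ITCOPY for A, OTCOPY for B) hand the micro-kernels the same
 * panel layouts as the TN and NT paths, which is why the TT_32x8xK/16x8xK block
 * kernels in the macro table simply alias the TN ones. Float indices are used
 * for clarity; the real buffers hold packed bf16, and alpha == 1 has its own
 * variant.
 */
static void gemm_tt_ref(int m, int n, int k, float alpha,
                        const float *A, int lda,    /* A is k x m, column-major */
                        const float *B, int ldb,    /* B is n x k, column-major */
                        float *C, int ldc)          /* C is m x n, column-major */
{
    for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++) {
            float sum = 0.0f;
            for (int p = 0; p < k; p++)
                sum += A[p + i * lda] * B[j + p * ldb];  /* A^T(i,p) * B^T(p,j) */
            C[i + j * ldc] += alpha * sum;
        }
}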
+ COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(idx_m, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
+ COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } + } +} +/* ----------------------------------------- End of TT kernels --------------------------------------- */ + +/* +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_internal_kernel_alpha(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) +#else // ALPHA is ONE +void sbgemm_internal_kernel_one(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) +#endif +{ + if (Order == CblasColMajor) { + if (TransA == CblasNoTrans) { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_NN(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_NT(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } + } else { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_TN(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_TT(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } + } + } else { + if (TransA == CblasNoTrans) { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_NN(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_TN(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } + } else { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_NT(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_TT(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } + } + } +} +*/ diff --git a/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c new file mode 100644 index 000000000..7ed03d70d --- /dev/null +++ b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c @@ -0,0 +1,353 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
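/*
 * The commented-out dispatcher above handles CblasRowMajor by swapping the
 * operands: a row-major C = op(A) * op(B) occupies the same memory as a
 * column-major C' = op(B)' * op(A)', so M/N, A/B, lda/ldb and the transpose
 * flags are exchanged and the column-major blocking kernels are reused.
 * A tiny self-contained check of that identity with plain floats:
 */
#include <stdio.h>

int main(void)
{
    /* row-major a (2x3) and b (3x2); their product is [[58,64],[139,154]] */
    float a[6] = { 1, 2, 3, 4, 5, 6 };
    float b[6] = { 7, 8, 9, 10, 11, 12 };
    float c[4] = { 0 };

    /* reinterpret the same bytes column-major (a becomes 3x2, b becomes 2x3)
       and compute the column-major product b * a into c */
    for (int j = 0; j < 2; j++)
        for (int i = 0; i < 2; i++)
            for (int p = 0; p < 3; p++)
                c[i + 2 * j] += b[i + 2 * p] * a[p + 3 * j];

    /* reading c back row-major yields exactly the row-major product a * b */
    printf("%g %g\n%g %g\n", c[0], c[1], c[2], c[3]);   /* 58 64 / 139 154 */
    return 0;
}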
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include "common.h" + +#define _MM512_SHUFFLE_i32(result, in1, in2, imm8) \ + asm("vshufps %3, %2, %1, %0": "=v"(result): "v"(in1), "v"(in2), "N"(imm8)) + +#define REORDER_8x32(t0, t1, t2, t3, t4, t5, t6, t7) { \ + __m512i v; \ + t0 = _mm512_unpacklo_epi32(r0, r1); \ + t1 = _mm512_unpackhi_epi32(r0, r1); \ + t2 = _mm512_unpacklo_epi32(r2, r3); \ + t3 = _mm512_unpackhi_epi32(r2, r3); \ + t4 = _mm512_unpacklo_epi32(r4, r5); \ + t5 = _mm512_unpackhi_epi32(r4, r5); \ + t6 = _mm512_unpacklo_epi32(r6, r7); \ + t7 = _mm512_unpackhi_epi32(r6, r7); \ + _MM512_SHUFFLE_i32(v, t0, t2, 0x4E); \ + r0 = _mm512_mask_blend_epi32(kc, t0, v); \ + r1 = _mm512_mask_blend_epi32(k3, t2, v); \ + _MM512_SHUFFLE_i32(v, t1, t3, 0x4E); \ + r2 = _mm512_mask_blend_epi32(kc, t1, v); \ + r3 = _mm512_mask_blend_epi32(k3, t3, v); \ + _MM512_SHUFFLE_i32(v, t4, t6, 0x4E); \ + r4 = _mm512_mask_blend_epi32(kc, t4, v); \ + r5 = _mm512_mask_blend_epi32(k3, t6, v); \ + _MM512_SHUFFLE_i32(v, t5, t7, 0x4E); \ + r6 = _mm512_mask_blend_epi32(kc, t5, v); \ + r7 = _mm512_mask_blend_epi32(k3, t7, v); \ + t0 = _mm512_permutex2var_epi32(r0, idx_lo, r4); \ + t1 = _mm512_permutex2var_epi32(r1, idx_lo, r5); \ + t2 = _mm512_permutex2var_epi32(r2, idx_lo, r6); \ + t3 = _mm512_permutex2var_epi32(r3, idx_lo, r7); \ + t4 = _mm512_permutex2var_epi32(r0, idx_hi, r4); \ + t5 = _mm512_permutex2var_epi32(r1, idx_hi, r5); \ + t6 = _mm512_permutex2var_epi32(r2, idx_hi, r6); \ + t7 = _mm512_permutex2var_epi32(r3, idx_hi, r7); \ +} + +#define STORE_512_LO(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + _mm512_storeu_si512(boffset0 + x*32, v); + +#define STORE_512_HI(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ + _mm512_storeu_si512(boffset0 + (x + 8)*32, v); + +#define MASK_STORE_512_LO(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + _mm512_mask_storeu_epi32(boffset0 + 2*x*remain_n, nmask, v); + +#define MASK_STORE_512_HI(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ + _mm512_mask_storeu_epi32(boffset0 + 2*(x + 8)*remain_n, nmask, v); + +#define STORE_512(x, y) {\ + __m512i v; \ + if (x == 0) { STORE_512_LO(y); } \ + else { STORE_512_HI(y); } \ +} + +#define MASK_STORE_512(x, y) {\ + __m512i v; \ + if (x == 0) { MASK_STORE_512_LO(y); } \ + else { MASK_STORE_512_HI(y); } \ +} + +#define SET_TAIL(y, x) {\ + if (y == 0) tail = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + else tail = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ +} + +#define GET_TAIL() \ + switch (n_store + 1) { \ + case 16: SET_TAIL(1, 7); break; \ + case 15: SET_TAIL(1, 6); break; \ + case 14: SET_TAIL(1, 5); break; \ + case 13: SET_TAIL(1, 4); break; \ + case 12: SET_TAIL(1, 3); break; \ + case 11: SET_TAIL(1, 2); break; \ + case 10: SET_TAIL(1, 1); break; \ + case 9: SET_TAIL(1, 0); break; \ + case 8: SET_TAIL(0, 7); break; \ + case 7: SET_TAIL(0, 6); break; \ + case 6: SET_TAIL(0, 5); break; 
\ + case 5: SET_TAIL(0, 4); break; \ + case 4: SET_TAIL(0, 3); break; \ + case 3: SET_TAIL(0, 2); break; \ + case 2: SET_TAIL(0, 1); break; \ + case 1: SET_TAIL(0, 0); break; \ + } + + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *boffset0; + IFLOAT *aoffset; + IFLOAT *aoffset00, *aoffset01, *aoffset02, *aoffset03, *aoffset04, *aoffset05, *aoffset06, *aoffset07; + IFLOAT *aoffset10, *aoffset11, *aoffset12, *aoffset13, *aoffset14, *aoffset15, *aoffset16, *aoffset17; + aoffset = a; + boffset0 = b; + + BLASLONG n16 = n & ~15; + BLASLONG m32 = m & ~31; + + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + }; + uint64_t permute_table2[] = { + 0x00, 0x01, 0x02, 0x03, 8|0x0, 8|0x1, 8|0x2, 8|0x3, + 0x04, 0x05, 0x06, 0x07, 8|0x4, 8|0x5, 8|0x6, 8|0x7, + }; + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo2 = _mm512_loadu_si512(permute_table2); + __m512i idx_hi2 = _mm512_loadu_si512(permute_table2 + 8); + __mmask16 kc = 0xcccc; + __mmask16 k3 = 0x3333; + __m512i r0, r1, r2, r3, r4, r5, r6, r7; + __m512i t00, t01, t02, t03, t04, t05, t06, t07; + __m512i t10, t11, t12, t13, t14, t15, t16, t17; + + for (j = 0; j < n16; j += 16) { + aoffset00 = aoffset; + aoffset01 = aoffset00 + lda; + aoffset02 = aoffset01 + lda; + aoffset03 = aoffset02 + lda; + aoffset04 = aoffset03 + lda; + aoffset05 = aoffset04 + lda; + aoffset06 = aoffset05 + lda; + aoffset07 = aoffset06 + lda; + aoffset10 = aoffset07 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset17 = aoffset16 + lda; + aoffset += 16 * lda; + for (i = 0; i < m32; i += 32) { + r0 = _mm512_loadu_si512(aoffset00 + i); + r1 = _mm512_loadu_si512(aoffset01 + i); + r2 = _mm512_loadu_si512(aoffset02 + i); + r3 = _mm512_loadu_si512(aoffset03 + i); + r4 = _mm512_loadu_si512(aoffset04 + i); + r5 = _mm512_loadu_si512(aoffset05 + i); + r6 = _mm512_loadu_si512(aoffset06 + i); + r7 = _mm512_loadu_si512(aoffset07 + i); + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + r0 = _mm512_loadu_si512(aoffset10 + i); + r1 = _mm512_loadu_si512(aoffset11 + i); + r2 = _mm512_loadu_si512(aoffset12 + i); + r3 = _mm512_loadu_si512(aoffset13 + i); + r4 = _mm512_loadu_si512(aoffset14 + i); + r5 = _mm512_loadu_si512(aoffset15 + i); + r6 = _mm512_loadu_si512(aoffset16 + i); + r7 = _mm512_loadu_si512(aoffset17 + i); + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + STORE_512(0, 0); STORE_512(0, 1); STORE_512(0, 2); STORE_512(0, 3); + STORE_512(0, 4); STORE_512(0, 5); STORE_512(0, 6); STORE_512(0, 7); + STORE_512(1, 0); STORE_512(1, 1); STORE_512(1, 2); STORE_512(1, 3); + STORE_512(1, 4); STORE_512(1, 5); STORE_512(1, 6); STORE_512(1, 7); + boffset0 += 16 * 32; + } + if (i < m) { + int remain_m = m - i; + __mmask32 mmask = (1UL << remain_m) - 1; + r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i); + r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i); + r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i); + r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i); + r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i); + r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i); + r6 = _mm512_maskz_loadu_epi16(mmask, aoffset06 + i); + r7 = 
_mm512_maskz_loadu_epi16(mmask, aoffset07 + i); + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i); + r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i); + r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i); + r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i); + r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i); + r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i); + r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i); + r7 = _mm512_maskz_loadu_epi16(mmask, aoffset17 + i); + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + int n_store = remain_m/2; + switch (n_store) { + case 15: STORE_512(1, 6); + case 14: STORE_512(1, 5); + case 13: STORE_512(1, 4); + case 12: STORE_512(1, 3); + case 11: STORE_512(1, 2); + case 10: STORE_512(1, 1); + case 9: STORE_512(1, 0); + case 8: STORE_512(0, 7); + case 7: STORE_512(0, 6); + case 6: STORE_512(0, 5); + case 5: STORE_512(0, 4); + case 4: STORE_512(0, 3); + case 3: STORE_512(0, 2); + case 2: STORE_512(0, 1); + case 1: STORE_512(0, 0); + } + boffset0 += n_store * 32; + if (m & 0x1) { + __m512i tail; + GET_TAIL(); + _mm256_storeu_si256((void *)boffset0, _mm512_cvtepi32_epi16(tail)); + boffset0 += 16; + } + } + + } + if (j < n) { + int remain_n = n - j; + __mmask16 nmask = (1UL << remain_n) - 1; + int load0, load1; + if (remain_n > 8) { + load0 = 8; + load1 = remain_n - 8; + } else { + load0 = remain_n; + load1 = 0; + } + aoffset00 = aoffset; + aoffset01 = aoffset00 + lda; + aoffset02 = aoffset01 + lda; + aoffset03 = aoffset02 + lda; + aoffset04 = aoffset03 + lda; + aoffset05 = aoffset04 + lda; + aoffset06 = aoffset05 + lda; + aoffset07 = aoffset06 + lda; + aoffset10 = aoffset07 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset17 = aoffset16 + lda; + aoffset += 16 * lda; + for (i = 0; i < m32; i += 32) { + switch (load0) { + case 8: r7 = _mm512_loadu_si512(aoffset07 + i); + case 7: r6 = _mm512_loadu_si512(aoffset06 + i); + case 6: r5 = _mm512_loadu_si512(aoffset05 + i); + case 5: r4 = _mm512_loadu_si512(aoffset04 + i); + case 4: r3 = _mm512_loadu_si512(aoffset03 + i); + case 3: r2 = _mm512_loadu_si512(aoffset02 + i); + case 2: r1 = _mm512_loadu_si512(aoffset01 + i); + case 1: r0 = _mm512_loadu_si512(aoffset00 + i); + } + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + switch (load1) { + case 8: r7 = _mm512_loadu_si512(aoffset17 + i); + case 7: r6 = _mm512_loadu_si512(aoffset16 + i); + case 6: r5 = _mm512_loadu_si512(aoffset15 + i); + case 5: r4 = _mm512_loadu_si512(aoffset14 + i); + case 4: r3 = _mm512_loadu_si512(aoffset13 + i); + case 3: r2 = _mm512_loadu_si512(aoffset12 + i); + case 2: r1 = _mm512_loadu_si512(aoffset11 + i); + case 1: r0 = _mm512_loadu_si512(aoffset10 + i); + } + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + MASK_STORE_512(0, 0); MASK_STORE_512(0, 1); MASK_STORE_512(0, 2); MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); MASK_STORE_512(0, 5); MASK_STORE_512(0, 6); MASK_STORE_512(0, 7); + MASK_STORE_512(1, 0); MASK_STORE_512(1, 1); MASK_STORE_512(1, 2); MASK_STORE_512(1, 3); + MASK_STORE_512(1, 4); MASK_STORE_512(1, 5); MASK_STORE_512(1, 6); MASK_STORE_512(1, 7); + boffset0 += remain_n * 32; + } + if (i < m) { + int remain_m = m - i; + __mmask32 mmask = (1UL << remain_m) - 1; + switch (load0) { + case 8: r7 = _mm512_maskz_loadu_epi16(mmask, aoffset07 + i); + case 7: r6 = 
_mm512_maskz_loadu_epi16(mmask, aoffset06 + i); + case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i); + case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i); + case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i); + case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i); + case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i); + case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i); + } + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + switch (load1) { + case 8: r7 = _mm512_maskz_loadu_epi16(mmask, aoffset17 + i); + case 7: r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i); + case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i); + case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i); + case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i); + case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i); + case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i); + case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i); + } + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + int n_store = remain_m/2; + switch (n_store) { + case 15: MASK_STORE_512(1, 6); + case 14: MASK_STORE_512(1, 5); + case 13: MASK_STORE_512(1, 4); + case 12: MASK_STORE_512(1, 3); + case 11: MASK_STORE_512(1, 2); + case 10: MASK_STORE_512(1, 1); + case 9: MASK_STORE_512(1, 0); + case 8: MASK_STORE_512(0, 7); + case 7: MASK_STORE_512(0, 6); + case 6: MASK_STORE_512(0, 5); + case 5: MASK_STORE_512(0, 4); + case 4: MASK_STORE_512(0, 3); + case 3: MASK_STORE_512(0, 2); + case 2: MASK_STORE_512(0, 1); + case 1: MASK_STORE_512(0, 0); + } + boffset0 += n_store * remain_n * 2; + if (m & 0x1) { + __m512i tail; + GET_TAIL(); + _mm256_mask_storeu_epi16((void *)boffset0, nmask, _mm512_cvtepi32_epi16(tail)); + } + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c new file mode 100644 index 000000000..eefbd7355 --- /dev/null +++ b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c @@ -0,0 +1,208 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
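A note on the tail handling in the ncopy kernel above: the copy packs adjacent pairs of elements along the m direction into 32-bit lanes (the two-at-a-time granularity the bf16 dot-product instructions work on), so the fall-through switch only writes n_store = remain_m/2 complete pairs. With remain_m = 7, for example, three pairs are stored; the leftover element of each of the 16 columns sits in the low half of the last, half-filled pair, which GET_TAIL selects and _mm512_cvtepi32_epi16 narrows into the final 16-element row of the packed block.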
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include "common.h" + +#define REORDER_4x32(r0, r1, r2, r3) {\ + __m512i t0, t1, t2, t3; \ + t0 = _mm512_unpacklo_epi32(r0, r1); \ + t1 = _mm512_unpackhi_epi32(r0, r1); \ + t2 = _mm512_unpacklo_epi32(r2, r3); \ + t3 = _mm512_unpackhi_epi32(r2, r3); \ + r0 = _mm512_unpacklo_epi64(t0, t2); \ + r1 = _mm512_unpackhi_epi64(t0, t2); \ + r2 = _mm512_unpacklo_epi64(t1, t3); \ + r3 = _mm512_unpackhi_epi64(t1, t3); \ + t0 = _mm512_permutex2var_epi32(r0, idx_lo_128, r1); \ + t1 = _mm512_permutex2var_epi32(r0, idx_hi_128, r1); \ + t2 = _mm512_permutex2var_epi32(r2, idx_lo_128, r3); \ + t3 = _mm512_permutex2var_epi32(r2, idx_hi_128, r3); \ + r0 = _mm512_permutex2var_epi32(t0, idx_lo_256, t2); \ + r1 = _mm512_permutex2var_epi32(t1, idx_lo_256, t3); \ + r2 = _mm512_permutex2var_epi32(t0, idx_hi_256, t2); \ + r3 = _mm512_permutex2var_epi32(t1, idx_hi_256, t3); \ +} + +#define REORDER_4x8(r0, r1, r2, r3) {\ + __m128i t0, t1, t2, t3; \ + t0 = _mm_unpacklo_epi32(r0, r1); \ + t1 = _mm_unpackhi_epi32(r0, r1); \ + t2 = _mm_unpacklo_epi32(r2, r3); \ + t3 = _mm_unpackhi_epi32(r2, r3); \ + r0 = _mm_unpacklo_epi64(t0, t2); \ + r1 = _mm_unpackhi_epi64(t0, t2); \ + r2 = _mm_unpacklo_epi64(t1, t3); \ + r3 = _mm_unpackhi_epi64(t1, t3); \ +} + +#define GET_TAIL(tail, remain_m) \ + switch((remain_m + 1)/2) { \ + case 1: tail = r0; break; \ + case 2: tail = r1; break; \ + case 3: tail = r2; break; \ + case 4: tail = r3; break; \ + } + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + IFLOAT *aoffset; + IFLOAT *aoffset0, *aoffset1, *aoffset2, *aoffset3; + + IFLOAT *boffset; + + aoffset = a; + boffset = b; + + BLASLONG m32 = m & ~31; + BLASLONG m8 = m & ~7; + BLASLONG n4 = n & ~3; + + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + }; + __m512i idx_lo_128 = _mm512_loadu_si512(permute_table); + __m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32); + __m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48); + + for (j = 0; j < n4; j += 4) { + aoffset0 = aoffset; + aoffset1 = aoffset0 + lda; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset += 4 * lda; + + for (i = 0; i < m32; i += 32) { + __m512i r0, r1, r2, r3; + r0 = _mm512_loadu_si512(aoffset0 + i); + r1 = _mm512_loadu_si512(aoffset1 + i); + r2 = _mm512_loadu_si512(aoffset2 + i); + r3 = _mm512_loadu_si512(aoffset3 + i); + REORDER_4x32(r0, r1, r2, r3); + _mm512_storeu_si512(boffset + 32*0, r0); + _mm512_storeu_si512(boffset + 32*1, r1); + _mm512_storeu_si512(boffset + 32*2, r2); + 
_mm512_storeu_si512(boffset + 32*3, r3); + boffset += 32 * 4; + } + for (; i < m8; i += 8) { + __m128i r0 = _mm_loadu_si128((void *)(aoffset0 + i)); + __m128i r1 = _mm_loadu_si128((void *)(aoffset1 + i)); + __m128i r2 = _mm_loadu_si128((void *)(aoffset2 + i)); + __m128i r3 = _mm_loadu_si128((void *)(aoffset3 + i)); + REORDER_4x8(r0, r1, r2, r3); + _mm_storeu_si128((void *)(boffset + 8*0), r0); + _mm_storeu_si128((void *)(boffset + 8*1), r1); + _mm_storeu_si128((void *)(boffset + 8*2), r2); + _mm_storeu_si128((void *)(boffset + 8*3), r3); + boffset += 8 * 4; + } + if (i < m) { + int remain_m = m - i; + __mmask8 r_mask = (1UL << remain_m) - 1; + __m128i r0 = _mm_maskz_loadu_epi16(r_mask, aoffset0 + i); + __m128i r1 = _mm_maskz_loadu_epi16(r_mask, aoffset1 + i); + __m128i r2 = _mm_maskz_loadu_epi16(r_mask, aoffset2 + i); + __m128i r3 = _mm_maskz_loadu_epi16(r_mask, aoffset3 + i); + REORDER_4x8(r0, r1, r2, r3); + + // store should skip the tail odd line + int num_store = remain_m/2; + switch(num_store) { + case 3: _mm_storeu_si128((void *)(boffset + 8*2), r2); + case 2: _mm_storeu_si128((void *)(boffset + 8*1), r1); + case 1: _mm_storeu_si128((void *)(boffset + 8*0), r0); + } + boffset += 8 * num_store; + + if (m & 0x1) { // handling the tail + __m128i tail; + GET_TAIL(tail, remain_m); + /* tail vector is fill with zero like: + * a, 0, b, 0, c, 0, d, 0 + * need to extract lo words of data and store + */ + tail = _mm_cvtepi32_epi16(tail); + _mm_store_sd((double *)boffset, (__m128d) tail); // only lower 4 bfloat valid + boffset += 4; + } + } + } + if (j < n) { + int remain_n = n - j; + __mmask8 nmask = (1UL << remain_n) - 1; + aoffset0 = aoffset; + aoffset1 = aoffset0 + lda; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + __m128i r0, r1, r2, r3; + for (i = 0; i < m8; i += 8) { + switch (remain_n) { + case 3: r2 = _mm_loadu_si128((void *)(aoffset2 + i)); + case 2: r1 = _mm_loadu_si128((void *)(aoffset1 + i)); + case 1: r0 = _mm_loadu_si128((void *)(aoffset0 + i)); + } + REORDER_4x8(r0, r1, r2, r3); + _mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0); + _mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1); + _mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2); + _mm_mask_storeu_epi32(boffset + remain_n * 6, nmask, r3); + boffset += 8 * remain_n; + } + if (i < m) { + int remain_m = m - i; + __mmask8 mmask = (1UL << remain_m) - 1; + switch (remain_n) { + case 3: r2 = _mm_maskz_loadu_epi16(mmask, aoffset2 + i); + case 2: r1 = _mm_maskz_loadu_epi16(mmask, aoffset1 + i); + case 1: r0 = _mm_maskz_loadu_epi16(mmask, aoffset0 + i); + } + REORDER_4x8(r0, r1, r2, r3); + + int num_store = remain_m/2; + switch (num_store) { + case 3: _mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2); + case 2: _mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1); + case 1: _mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0); + } + boffset += 2 * num_store * remain_n; + + if (m & 0x1) { + __m128i tail; + GET_TAIL(tail, remain_m); + tail = _mm_cvtepi32_epi16(tail); + _mm_mask_storeu_epi16(boffset, nmask, tail); + } + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_oncopy_16_spr.c b/kernel/x86_64/sbgemm_oncopy_16_spr.c new file mode 100644 index 000000000..ccb00ada1 --- /dev/null +++ b/kernel/x86_64/sbgemm_oncopy_16_spr.c @@ -0,0 +1,128 @@ +/*************************************************************************** + * Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. 
+ * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include +#include "common.h" + +typedef struct { + char palette_id; + char start_row; + char dummy0[14]; // bytes 2-15 reserved, must be zero + short tile_colsb[8]; + char dummy1[16]; // bytes 32-47 reserved, must be zero + char tile_rows[8]; + char dummy2[16]; // bytes 56-63 reserved, must be zero +} tilecfg; + +#define T_16x32 0 +#define T_16xm 1 +#define T_nx32 2 +#define T_nxm 3 + +#define TCONF(cfg, m, n) \ + memset(&cfg, 0, sizeof(tilecfg)); \ + cfg.palette_id = 1; \ + cfg.tile_rows[T_16x32] = 16; \ + cfg.tile_colsb[T_16x32] = 64; \ + if (m) { \ + cfg.tile_rows[T_16xm] = 16; \ + cfg.tile_colsb[T_16xm] = m * 2; \ + } \ + if (n) { \ + cfg.tile_rows[T_nx32] = n; \ + cfg.tile_colsb[T_nx32] = 64; \ + } \ + if (m && n) { \ + cfg.tile_rows[T_nxm] = n; \ + cfg.tile_colsb[T_nxm] = m * 2; \ + } \ + _tile_loadconfig(&cfg); + + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + BLASLONG i, j; + IFLOAT *aoffset, *boffset; + IFLOAT *aoffset0; + + aoffset = a; + boffset = b; + + BLASLONG n16 = n & ~15; + BLASLONG m32 = m & ~31; + BLASLONG m2 = m & ~1; + + BLASLONG tail_m = m2 - m32; + BLASLONG tail_n = n - n16; + tilecfg cfg; + TCONF(cfg, tail_m, tail_n); + + for (j = 0; j < n16; j += 16) { + aoffset0 = aoffset; + for (i = 0; i < m32; i += 32) { + _tile_loadd(T_16x32, aoffset0, lda * 2); + _tile_stored(T_16x32, boffset, 32 * 2); + aoffset0 += 32; + boffset += 32 * 16; + } + if (i < m2) { + _tile_loadd(T_16xm, aoffset0, lda * 2); + _tile_stored(T_16xm, boffset, tail_m * 2); + aoffset0 += tail_m; + boffset += tail_m * 16; + i = m2; + } + if (i < m) { + /* the tail odd k should put alone */ + for (int ii = 0; ii < 16; ii++) { + *(boffset + ii) = *(aoffset0 + lda * ii); + } + boffset += 16; + } + aoffset += 16 * lda; + } + if (j < n) { + aoffset0 = aoffset; + for (i = 0; i < m32; i += 32) { + _tile_loadd(T_nx32, aoffset0, lda * 2); + _tile_stored(T_nx32, boffset, 32 * 2); + aoffset0 += 32; + boffset += 32 * 
tail_n; + } + if (i < m2) { + _tile_loadd(T_nxm, aoffset0, lda * 2); + _tile_stored(T_nxm, boffset, tail_m * 2); + aoffset0 += tail_m; + boffset += tail_m * tail_n; + } + if (i < m) { + for (int ii = 0; ii < tail_n; ii++) { + *(boffset + ii) = *(aoffset0 + lda * ii); + } + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_otcopy_16_spr.c b/kernel/x86_64/sbgemm_otcopy_16_spr.c new file mode 100644 index 000000000..b5d5d38fb --- /dev/null +++ b/kernel/x86_64/sbgemm_otcopy_16_spr.c @@ -0,0 +1,302 @@ +/*************************************************************************** + * Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
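The oncopy kernel above stages data through AMX tiles: TCONF programs palette 1 with up to four tile shapes (a full 16x32 bf16 tile plus the m and n remainders; colsb is given in bytes, hence 64 for 32 bf16 per row), after which _tile_loadd pulls in a strided block (lda * 2 bytes per row, bf16 being 2 bytes) and _tile_stored writes it back packed. A minimal sketch of the same configure/load/store sequence, assuming the AMX intrinsics from immintrin.h, a compiler flag such as -mamx-tile, and an OS that has enabled the AMX tile state (on Linux this is typically requested via arch_prctl); the struct and function names here are only for the illustration:

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

typedef struct {              /* 64-byte AMX tile configuration, palette 1 */
    uint8_t  palette_id;      /* byte 0                                    */
    uint8_t  start_row;       /* byte 1                                    */
    uint8_t  reserved0[14];   /* bytes 2-15, must be zero                  */
    uint16_t colsb[16];       /* bytes 16-47: bytes per tile row           */
    uint8_t  rows[16];        /* bytes 48-63: rows per tile                */
} amx_tilecfg;

/* Copy one 16x32 block of bf16 (held as uint16_t here) through tile 0. */
static void copy_16x32_bf16(const uint16_t *src, long ld_src, uint16_t *dst)
{
    amx_tilecfg cfg;
    memset(&cfg, 0, sizeof(cfg));
    cfg.palette_id = 1;
    cfg.rows[0]    = 16;            /* 16 rows                         */
    cfg.colsb[0]   = 32 * 2;        /* 32 bf16 = 64 bytes per row      */
    _tile_loadconfig(&cfg);

    _tile_loadd(0, src, ld_src * 2); /* strided load, stride in bytes  */
    _tile_stored(0, dst, 32 * 2);    /* packed store, 64-byte rows     */
    _tile_release();                 /* clear the tile configuration   */
}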
+ * *****************************************************************************/ + +#include +#include "common.h" + +#define LOAD_A_8VEC(aptr) \ + r0 = _mm256_loadu_si256((__m256i *)(aptr + lda*0)); \ + r1 = _mm256_loadu_si256((__m256i *)(aptr + lda*1)); \ + r2 = _mm256_loadu_si256((__m256i *)(aptr + lda*2)); \ + r3 = _mm256_loadu_si256((__m256i *)(aptr + lda*3)); \ + r4 = _mm256_loadu_si256((__m256i *)(aptr + lda*4)); \ + r5 = _mm256_loadu_si256((__m256i *)(aptr + lda*5)); \ + r6 = _mm256_loadu_si256((__m256i *)(aptr + lda*6)); \ + r7 = _mm256_loadu_si256((__m256i *)(aptr + lda*7)); + +#define MASK_LOAD_A_8VEC(aptr) \ + r0 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*0)); \ + r1 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*1)); \ + r2 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*2)); \ + r3 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*3)); \ + r4 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*4)); \ + r5 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*5)); \ + r6 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*6)); \ + r7 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*7)); + +#define SWITCH_LOAD_A_8VEC(aptr, cond) \ + switch((cond)) { \ + case 8: r7 = _mm256_loadu_si256((__m256i *)(aptr + lda*7)); \ + case 7: r6 = _mm256_loadu_si256((__m256i *)(aptr + lda*6)); \ + case 6: r5 = _mm256_loadu_si256((__m256i *)(aptr + lda*5)); \ + case 5: r4 = _mm256_loadu_si256((__m256i *)(aptr + lda*4)); \ + case 4: r3 = _mm256_loadu_si256((__m256i *)(aptr + lda*3)); \ + case 3: r2 = _mm256_loadu_si256((__m256i *)(aptr + lda*2)); \ + case 2: r1 = _mm256_loadu_si256((__m256i *)(aptr + lda*1)); \ + case 1: r0 = _mm256_loadu_si256((__m256i *)(aptr + lda*0)); \ + } + +#define SWITCH_MASK_LOAD_A_8VEC(aptr, cond) \ + switch((cond)) { \ + case 8: r7 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*7)); \ + case 7: r6 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*6)); \ + case 6: r5 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*5)); \ + case 5: r4 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*4)); \ + case 4: r3 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*3)); \ + case 3: r2 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*2)); \ + case 2: r1 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*1)); \ + case 1: r0 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*0)); \ + } + +#define REORDER_8x16(t0, t1, t2, t3, t4, t5, t6, t7) \ + t0 = _mm256_unpacklo_epi16(r0, r1); \ + t1 = _mm256_unpackhi_epi16(r0, r1); \ + t2 = _mm256_unpacklo_epi16(r2, r3); \ + t3 = _mm256_unpackhi_epi16(r2, r3); \ + t4 = _mm256_unpacklo_epi16(r4, r5); \ + t5 = _mm256_unpackhi_epi16(r4, r5); \ + t6 = _mm256_unpacklo_epi16(r6, r7); \ + t7 = _mm256_unpackhi_epi16(r6, r7); \ + r0 = _mm256_unpacklo_epi32(t0, t2); \ + r1 = _mm256_unpacklo_epi32(t1, t3); \ + r2 = _mm256_unpacklo_epi32(t4, t6); \ + r3 = _mm256_unpacklo_epi32(t5, t7); \ + r4 = _mm256_unpackhi_epi32(t0, t2); \ + r5 = _mm256_unpackhi_epi32(t1, t3); \ + r6 = _mm256_unpackhi_epi32(t4, t6); \ + r7 = _mm256_unpackhi_epi32(t5, t7); \ + t0 = _mm256_unpacklo_epi64(r0, r2); \ + t1 = _mm256_unpackhi_epi64(r0, r2); \ + t2 = _mm256_unpacklo_epi64(r4, r6); \ + t3 = _mm256_unpackhi_epi64(r4, r6); \ + t4 = _mm256_unpacklo_epi64(r1, r3); \ + t5 = _mm256_unpackhi_epi64(r1, r3); \ + t6 = _mm256_unpacklo_epi64(r5, r7); \ + t7 = _mm256_unpackhi_epi64(r5, r7); + +#define STORE_256_LO(x) \ + v = _mm256_permute2x128_si256(t0##x, t1##x, 0x20); \ + 
_mm256_storeu_si256((__m256i *)(boffset + x*32), v); + +#define STORE_256_HI(x) \ + v = _mm256_permute2x128_si256(t0##x, t1##x, 0x31); \ + _mm256_storeu_si256((__m256i *)(boffset + (x + 8)*32), v); + +#define MASK_STORE_256_LO(x) \ + v = _mm256_permute2x128_si256(t0##x, t1##x, 0x20); \ + _mm256_mask_storeu_epi16(boffset + x*m_load, mmask, v); + +#define MASK_STORE_256_HI(x) \ + v = _mm256_permute2x128_si256(t0##x, t1##x, 0x31); \ + _mm256_mask_storeu_epi16(boffset + (x + 8)*m_load, mmask, v); + +#define STORE_256(x, y) {\ + __m256i v; \ + if (x == 0) { STORE_256_LO(y); } \ + else { STORE_256_HI(y); } \ +} + +#define MASK_STORE_256(x, y) {\ + __m256i v; \ + if (x == 0) { MASK_STORE_256_LO(y); } \ + else { MASK_STORE_256_HI(y); } \ +} + +#define SWITCH_STORE_16x(cond, func) \ + switch((cond)) {\ + case 15: func(1, 6); \ + case 14: func(1, 5); \ + case 13: func(1, 4); \ + case 12: func(1, 3); \ + case 11: func(1, 2); \ + case 10: func(1, 1); \ + case 9: func(1, 0); \ + case 8: func(0, 7); \ + case 7: func(0, 6); \ + case 6: func(0, 5); \ + case 5: func(0, 4); \ + case 4: func(0, 3); \ + case 3: func(0, 2); \ + case 2: func(0, 1); \ + case 1: func(0, 0); \ + } + + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *aoffset, *boffset; + IFLOAT *aoffset00, *aoffset01, *aoffset10, *aoffset11; + IFLOAT *boffset0; + + __m256i r0, r1, r2, r3, r4, r5, r6, r7; + __m256i t00, t01, t02, t03, t04, t05, t06, t07; + __m256i t10, t11, t12, t13, t14, t15, t16, t17; + + aoffset = a; + boffset = b; + BLASLONG n_count = n; + BLASLONG m_count = m; + for (; n_count > 15; n_count -= 16) { + aoffset00 = aoffset; + aoffset01 = aoffset00 + 8 * lda; + aoffset10 = aoffset01 + 8 * lda; + aoffset11 = aoffset10 + 8 * lda; + aoffset += 16; + m_count = m; + for (; m_count > 31; m_count -= 32) { + // first 16 rows + LOAD_A_8VEC(aoffset00); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + LOAD_A_8VEC(aoffset01); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + STORE_256(0, 0); STORE_256(0, 1); STORE_256(0, 2); STORE_256(0, 3); + STORE_256(0, 4); STORE_256(0, 5); STORE_256(0, 6); STORE_256(0, 7); + STORE_256(1, 0); STORE_256(1, 1); STORE_256(1, 2); STORE_256(1, 3); + STORE_256(1, 4); STORE_256(1, 5); STORE_256(1, 6); STORE_256(1, 7); + // last 16 rows + boffset += 16; + LOAD_A_8VEC(aoffset10); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + LOAD_A_8VEC(aoffset11); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + STORE_256(0, 0); STORE_256(0, 1); STORE_256(0, 2); STORE_256(0, 3); + STORE_256(0, 4); STORE_256(0, 5); STORE_256(0, 6); STORE_256(0, 7); + STORE_256(1, 0); STORE_256(1, 1); STORE_256(1, 2); STORE_256(1, 3); + STORE_256(1, 4); STORE_256(1, 5); STORE_256(1, 6); STORE_256(1, 7); + aoffset00 += 32 * lda; + aoffset01 += 32 * lda; + aoffset10 += 32 * lda; + aoffset11 += 32 * lda; + boffset += 31 * 16; + } + if (m_count > 1) { + int m_load = m_count & ~1; + m_count -= m_load; + __mmask16 mmask; + SWITCH_LOAD_A_8VEC(aoffset00, m_load > 8 ? 8: m_load); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + if (m_load > 8) { + SWITCH_LOAD_A_8VEC(aoffset01, m_load > 16 ? 8: m_load - 8); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + } + int this_load = m_load > 16 ? 
16 : m_load; + mmask = (1UL << this_load) - 1; + MASK_STORE_256(0, 0); MASK_STORE_256(0, 1); MASK_STORE_256(0, 2); MASK_STORE_256(0, 3); + MASK_STORE_256(0, 4); MASK_STORE_256(0, 5); MASK_STORE_256(0, 6); MASK_STORE_256(0, 7); + MASK_STORE_256(1, 0); MASK_STORE_256(1, 1); MASK_STORE_256(1, 2); MASK_STORE_256(1, 3); + MASK_STORE_256(1, 4); MASK_STORE_256(1, 5); MASK_STORE_256(1, 6); MASK_STORE_256(1, 7); + boffset0 = boffset; + if (m_load > 16) { + boffset += this_load; + SWITCH_LOAD_A_8VEC(aoffset10, m_load > 24 ? 8: m_load - 16); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + if (m_load > 24) { + SWITCH_LOAD_A_8VEC(aoffset11, m_load - 24); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + } + this_load = m_load - 16; + mmask = (1UL << this_load) - 1; + MASK_STORE_256(0, 0); MASK_STORE_256(0, 1); MASK_STORE_256(0, 2); MASK_STORE_256(0, 3); + MASK_STORE_256(0, 4); MASK_STORE_256(0, 5); MASK_STORE_256(0, 6); MASK_STORE_256(0, 7); + MASK_STORE_256(1, 0); MASK_STORE_256(1, 1); MASK_STORE_256(1, 2); MASK_STORE_256(1, 3); + MASK_STORE_256(1, 4); MASK_STORE_256(1, 5); MASK_STORE_256(1, 6); MASK_STORE_256(1, 7); + } + boffset = boffset0 + 16 * m_load; + aoffset00 += m_load * lda; + } + if (m_count > 0) { + // just copy lask K to B directly + r0 = _mm256_loadu_si256((__m256i *)(aoffset00)); + _mm256_storeu_si256((__m256i *)(boffset), r0); + boffset += 16; + } + } + if (n_count > 0) { + __mmask16 nmask = (1UL << n_count) - 1; + aoffset00 = aoffset; + aoffset01 = aoffset00 + 8 * lda; + aoffset10 = aoffset01 + 8 * lda; + aoffset11 = aoffset10 + 8 * lda; + m_count = m; + for (; m_count > 31; m_count -= 32) { + // first 16 rows + MASK_LOAD_A_8VEC(aoffset00); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + MASK_LOAD_A_8VEC(aoffset01); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + SWITCH_STORE_16x(n_count, STORE_256); + // last 16 rows + boffset0 = boffset; + boffset += 16; + MASK_LOAD_A_8VEC(aoffset10); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + MASK_LOAD_A_8VEC(aoffset11); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + SWITCH_STORE_16x(n_count, STORE_256); + aoffset00 += 32 * lda; + aoffset01 += 32 * lda; + aoffset10 += 32 * lda; + aoffset11 += 32 * lda; + boffset = 32 * n_count + boffset0; + } + if (m_count > 1) { + int m_load = m_count & ~1; + m_count -= m_load; + __mmask16 mmask; + SWITCH_MASK_LOAD_A_8VEC(aoffset00, m_load > 8 ? 8: m_load); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + if (m_load > 8) { + SWITCH_MASK_LOAD_A_8VEC(aoffset01, m_load > 16 ? 8: m_load - 8); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + } + int this_load = m_load > 16 ? 16 : m_load; + mmask = (1UL << this_load) - 1; + SWITCH_STORE_16x(n_count, MASK_STORE_256); + boffset0 = boffset; + if (m_load > 16) { + boffset += this_load; + SWITCH_MASK_LOAD_A_8VEC(aoffset10, m_load > 24 ? 
8: m_load - 16); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + if (m_load > 24) { + SWITCH_MASK_LOAD_A_8VEC(aoffset11, m_load - 24); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + } + this_load = m_load - 16; + mmask = (1UL << this_load) - 1; + SWITCH_STORE_16x(n_count, MASK_STORE_256); + } + boffset = boffset0 + n_count * m_load; + aoffset00 += m_load * lda; + } + if (m_count > 0) { + // just copy lask K to B directly + r0 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aoffset00)); + _mm256_mask_storeu_epi16((__m256i *)(boffset), nmask, r0); + boffset += 16; + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c new file mode 100644 index 000000000..ec40a5054 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_NN +#include "sbgemm_small_kernel_template_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c new file mode 100644 index 000000000..1cdfd2936 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_NT +#include "sbgemm_small_kernel_template_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c new file mode 100644 index 000000000..70becd9fa --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c @@ -0,0 +1,48 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
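Throughout these copy kernels the ragged edges are handled with the same AVX-512 mask idiom: (1UL << remain) - 1 sets the low `remain` mask bits, a maskz load zero-fills the inactive lanes, and a masked store writes only the active ones, so neither side touches memory past the tail. A minimal, self-contained sketch of that idiom (illustrative only; plain uint16_t stands in for bfloat16 and the function name is made up for the example; requires AVX512BW/VL):

#include <immintrin.h>
#include <stdint.h>

/* Copy `remain` (1..16) 16-bit elements; lanes beyond `remain` are read
 * as zero and never written, so no out-of-bounds access occurs. */
static void copy_tail_bf16(const uint16_t *src, uint16_t *dst, int remain)
{
    __mmask16 k = (__mmask16)((1U << remain) - 1); /* low `remain` lanes active */
    __m256i v = _mm256_maskz_loadu_epi16(k, src);  /* masked, zero-filling load */
    _mm256_mask_storeu_epi16(dst, k, v);           /* masked store              */
}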
+*****************************************************************************/ + +#include "common.h" + +#include "sbgemm_block_microk_cooperlake.c" +// Define micro kernels for ALPHA not ONE scenarios +#undef ONE_ALPHA +#include "sbgemm_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA as ONE scenarios +#define ONE_ALPHA 1 +#include "sbgemm_microk_cooperlake_template.c" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + if (MNK > 256.0*256.0*256.0) // disable for big size matrix + return 0; + /* small matrix kernel works well for N = 8, 16, 32 */ + if (N == 8 || N == 16 || N == 32) + return 1; + return 0; +} diff --git a/kernel/x86_64/sbgemm_small_kernel_permit_spr.c b/kernel/x86_64/sbgemm_small_kernel_permit_spr.c new file mode 100644 index 000000000..98d8ca06a --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_permit_spr.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include "sbgemm_block_microk_cooperlake.c" +// Define micro kernels for ALPHA not ONE scenarios +#undef ONE_ALPHA +#include "sbgemm_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA as ONE scenarios +#define ONE_ALPHA 1 +#include "sbgemm_microk_cooperlake_template.c" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + return 0; +} diff --git a/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c new file mode 100644 index 000000000..1ab7a34ab --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c @@ -0,0 +1,96 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +extern void sbgemm_scal_operation(BLASLONG M, BLASLONG N, float beta, float *C, BLASLONG ldc); +extern void sbgemm_zero_operation(BLASLONG M, BLASLONG N, float *C, BLASLONG ldc); + +extern void sbgemm_blocking_kernel_nn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_nn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_nt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_nt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); + +#if defined(TRANS_NN) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_nn_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_nn_alpha 
+#elif defined(TRANS_NT) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_nt_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_nt_alpha +#elif defined(TRANS_TN) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_tn_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_tn_alpha +#elif defined(TRANS_TT) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_tt_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_tt_alpha +#endif + +#define BF16_BLOCK_THRES_K 1024 +// If we want to adjust this to be bigger, need to change COL_MAJOR_INCOPY_KERNEL_Kx32 kernel to be bigger also +#define BF16_BLOCK_THRES_M 32 +#define BF16_BLOCK_THRES_N 1024 + +#define MALLOC_ALIGN64(ptr, size, raw_ptr) \ + raw_ptr = malloc((size) + 63); \ + ptr = (bfloat16 *)(((uintptr_t) raw_ptr + 63) & ~(uintptr_t)63) + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + bfloat16 * block_A; + bfloat16 * block_B; + void* raw_ptrA; + void* raw_ptrB; + + MALLOC_ALIGN64(block_A, sizeof(bfloat16) * BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M, raw_ptrA); + MALLOC_ALIGN64(block_B, sizeof(bfloat16) * BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K, raw_ptrB); + +#if defined(B0) + sbgemm_zero_operation(M, N, C, ldc); +#else + sbgemm_scal_operation(M, N, beta, C, ldc); +#endif + + if (alpha == ONE) { + SBGEMM_BLOCKING_KERNEL_ONE(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } else { + SBGEMM_BLOCKING_KERNEL_ALPHA(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } + + free(raw_ptrA); + free(raw_ptrB); + return 0; +} diff --git a/kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c new file mode 100644 index 000000000..f1a0d0d0c --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_TN +#include "sbgemm_small_kernel_template_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c new file mode 100644 index 000000000..8a2a597bc --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_TT +#include "sbgemm_small_kernel_template_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c new file mode 100644 index 000000000..88725f343 --- /dev/null +++ b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c @@ -0,0 +1,164 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
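The MALLOC_ALIGN64 helper above over-allocates by 63 bytes and rounds the raw pointer up to the next 64-byte boundary, keeping the original pointer around so that free() is still called on the value malloc actually returned. A small illustration of the round-up arithmetic (the function name is only for this example):

#include <stdint.h>

/* Round a pointer up to the next multiple of 64.  Allocating size + 63
 * bytes guarantees that `size` bytes remain usable after rounding. */
static void *align_up_64(void *raw)
{
    return (void *)(((uintptr_t)raw + 63) & ~(uintptr_t)63);
}
/* e.g. 0x1001 -> 0x1040, while 0x1040 (already aligned) stays 0x1040 */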
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include "common.h" + + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *boffset0, *boffset1; + + boffset0 = b; + + BLASLONG n32 = n & ~31; + BLASLONG m4 = m & ~3; + BLASLONG m2 = m & ~1; + + uint32_t permute_table[] = { + 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, + 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f, + }; + + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); + + for (j = 0; j < n32; j += 32) { + /* process 2x16 n at the same time */ + boffset1 = boffset0 + m * 16; + for (i = 0; i < m4; i += 4) { + /* bf16 fma need special memory layout: + * for memory layout like below: + * a00, a01, a02, a03, a04, a05 .... + * a10, a11, a12, a13, a14, a15 .... + * need to copy as: + * a00, a10, a01, a11, a02, a12, a03, a13, ... 
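+ * roughly the scalar form of the interleave done by the intrinsics
+ * below, for the first 16 of the 32 columns handled per iteration
+ * (c is just an index for this illustration):
+ *   for (c = 0; c < 16; c++) {
+ *     boffset0[2*c + 0] = a[(i + 0)*lda + j + c];
+ *     boffset0[2*c + 1] = a[(i + 1)*lda + j + c];
+ *   }
+ * the second 16 columns go to boffset1 in the same pattern, so every
+ * 32-bit lane pairs one value from row i with one from row i+1, which
+ * is the layout the bf16 fma needs as noted above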
+ */ + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); + __m512i a2 = _mm512_loadu_si512(&a[(i + 2)*lda + j]); + __m512i a3 = _mm512_loadu_si512(&a[(i + 3)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + __m512i a10 = _mm512_unpacklo_epi16(a2, a3); + __m512i a11 = _mm512_unpackhi_epi16(a2, a3); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + a2 = _mm512_permutex2var_epi32(a10, idx_lo, a11); + a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11); + + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + _mm512_storeu_si512(boffset0 + 32, a2); + _mm512_storeu_si512(boffset1 + 32, a3); + boffset0 += 64; + boffset1 += 64; + } + for (; i < m2; i += 2) { + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + boffset0 += 32; + boffset1 += 32; + } + for (; i < m; i++) { + /* just copy the only remains row */ + __m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256((void *)boffset0, a0); + _mm256_storeu_si256((void *)boffset1, a1); + boffset0 += 16; + boffset1 += 16; + } + boffset0 = boffset1; + } + if (j < n) { + uint32_t remains = n - j; + __mmask32 r_mask = (1UL << remains) - 1; + if (remains > 16) { + boffset1 = boffset0 + m * 16; + uint32_t tail1 = remains - 16; + __mmask16 w_mask1 = (1UL << tail1) - 1; + for (i = 0; i < m2; i += 2) { + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + + _mm512_storeu_si512(boffset0, a0); + _mm512_mask_storeu_epi32(boffset1, w_mask1, a1); + + boffset0 += 32; + boffset1 += 2 * tail1; + } + for (; i < m; i++) { + __m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_maskz_loadu_epi16(w_mask1, (void *)&a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256((void *)boffset0, a0); + _mm256_mask_storeu_epi16((void *)boffset1, w_mask1, a1); + boffset0 += 16; + boffset1 += tail1; + } + } else { + __mmask16 w_mask = (1UL << remains ) - 1; + for (i = 0; i < m2; i += 2) { + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + + _mm512_mask_storeu_epi32(boffset0, w_mask, a0); + boffset0 += 2 * remains; + } + for (; i < m; i++) { + __m256i a0 = _mm256_maskz_loadu_epi16(w_mask, &a[(i + 0)*lda + j]); + _mm256_mask_storeu_epi16(boffset0, w_mask, a0); + boffset0 += remains; + } + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c new file mode 100644 index 000000000..e9edd4571 --- /dev/null +++ b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c @@ -0,0 
+1,216 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include "common.h" + +#define STORE_VEC(Bx, By, vec) \ + if (By == 0) asm("vmovdqu16 %0, (%1)": : "v"(vec), "r"(boffset##Bx)); \ + else asm("vmovdqu16 %0, (%1, %2, %c3)": : "v"(vec), "r"(boffset##Bx), "r"(blk_size), "n"(By * 2)); + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *boffset0, *boffset1; + + boffset0 = b; + + BLASLONG n24 = n - (n % 24); + BLASLONG n8 = n & ~7; + BLASLONG m8 = m & ~7; + BLASLONG m4 = m & ~3; + BLASLONG m2 = m & ~1; + + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + }; + + j = 0; + if (n > 23) { + /* n = 24 is the max width in current blocking setting */ + __m512i idx_lo_128 = _mm512_loadu_si512(permute_table); + __m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32); + __m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48); + __mmask32 mask24 = (1UL << 24) - 1; + BLASLONG blk_size = m * 4; + BLASLONG stride = blk_size * 3; + + for (; j < n24; j += 24) { + boffset1 = boffset0 + stride; + for (i = 0; i < m8; i += 8) { + __m512i r0, r1, r2, r3, r4, r5, r6, r7; + __m512i t0, t1, t2, t3, t4, t5, t6, t7; + r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]); + r1 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]); + r2 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 2)*lda + j]); + r3 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 3)*lda + j]); + r4 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 4)*lda + j]); + r5 = 
_mm512_maskz_loadu_epi16(mask24, &a[(i + 5)*lda + j]); + r6 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 6)*lda + j]); + r7 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 7)*lda + j]); + + t0 = _mm512_unpacklo_epi16(r0, r1); + t1 = _mm512_unpackhi_epi16(r0, r1); + t2 = _mm512_unpacklo_epi16(r2, r3); + t3 = _mm512_unpackhi_epi16(r2, r3); + t4 = _mm512_unpacklo_epi16(r4, r5); + t5 = _mm512_unpackhi_epi16(r4, r5); + t6 = _mm512_unpacklo_epi16(r6, r7); + t7 = _mm512_unpackhi_epi16(r6, r7); + + r0 = _mm512_permutex2var_epi32(t0, idx_lo_128, t2); + r1 = _mm512_permutex2var_epi32(t1, idx_lo_128, t3); + r2 = _mm512_permutex2var_epi32(t4, idx_lo_128, t6); + r3 = _mm512_permutex2var_epi32(t5, idx_lo_128, t7); + r4 = _mm512_permutex2var_epi32(t0, idx_hi_128, t2); + r5 = _mm512_permutex2var_epi32(t1, idx_hi_128, t3); + r6 = _mm512_permutex2var_epi32(t4, idx_hi_128, t6); + r7 = _mm512_permutex2var_epi32(t5, idx_hi_128, t7); + + t0 = _mm512_permutex2var_epi32(r0, idx_lo_256, r2); + t1 = _mm512_permutex2var_epi32(r1, idx_lo_256, r3); + t2 = _mm512_permutex2var_epi32(r4, idx_lo_256, r6); + t3 = _mm512_permutex2var_epi32(r5, idx_lo_256, r7); + t4 = _mm512_permutex2var_epi32(r0, idx_hi_256, r2); + t5 = _mm512_permutex2var_epi32(r1, idx_hi_256, r3); + + STORE_VEC(0, 0, t0); STORE_VEC(0, 1, t1); STORE_VEC(0, 2, t2); + STORE_VEC(1, 0, t3); STORE_VEC(1, 1, t4); STORE_VEC(1, 2, t5); + boffset0 += 32; + boffset1 += 32; + } + for (; i < m2; i += 2) { + __m512i r0, r1, t0, t1; + r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]); + r1 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]); + t0 = _mm512_unpacklo_epi16(r0, r1); + t1 = _mm512_unpackhi_epi16(r0, r1); + STORE_VEC(0, 0, _mm512_extracti32x4_epi32(t0, 0)); + STORE_VEC(0, 1, _mm512_extracti32x4_epi32(t1, 0)); + STORE_VEC(0, 2, _mm512_extracti32x4_epi32(t0, 1)); + STORE_VEC(1, 0, _mm512_extracti32x4_epi32(t1, 1)); + STORE_VEC(1, 1, _mm512_extracti32x4_epi32(t0, 2)); + STORE_VEC(1, 2, _mm512_extracti32x4_epi32(t1, 2)); + boffset0 += 8; + boffset1 += 8; + } + for (; i < m; i++) { + *(uint64_t *)(boffset0 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 0]; + *(uint64_t *)(boffset0 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 4]; + *(uint64_t *)(boffset0 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 8]; + *(uint64_t *)(boffset1 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 12]; + *(uint64_t *)(boffset1 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 16]; + *(uint64_t *)(boffset1 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 20]; + boffset0 += 4; + boffset1 += 4; + } + boffset0 += stride * 2; + } + } + + for (; j < n8; j += 8) { + boffset1 = boffset0 + m * 4; + for (i = 0; i < m4; i += 4) { + __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]); + __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]); + __m128i a2 = _mm_loadu_si128((void *)&a[(i + 2)*lda + j]); + __m128i a3 = _mm_loadu_si128((void *)&a[(i + 3)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = _mm_unpackhi_epi16(a0, a1); + __m128i a10 = _mm_unpacklo_epi16(a2, a3); + __m128i a11 = _mm_unpackhi_epi16(a2, a3); + _mm_storeu_si128((void *)(boffset0 + 0), a00); + _mm_storeu_si128((void *)(boffset0 + 8), a10); + _mm_storeu_si128((void *)(boffset1 + 0), a01); + _mm_storeu_si128((void *)(boffset1 + 8), a11); + boffset0 += 16; + boffset1 += 16; + } + for (; i < m2; i+= 2) { + __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]); + __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = 
_mm_unpackhi_epi16(a0, a1); + _mm_storeu_si128((void *)(boffset0 + 0), a00); + _mm_storeu_si128((void *)(boffset1 + 0), a01); + boffset0 += 8; + boffset1 += 8; + } + for (; i < m; i++) { + __m128d a0 = _mm_loadu_pd((void *)&a[(i + 0)*lda + j]); + _mm_store_sd((void *)boffset0, a0); + _mm_store_sd((void *)boffset1, _mm_permute_pd(a0, 0x1)); + boffset0 += 4; + boffset1 += 4; + } + boffset0 = boffset1; + } + if (j < n) { + uint32_t remains = n - j; + __mmask8 r_mask = (1UL << remains) - 1; + if (remains > 4) { + boffset1 = boffset0 + m * 4; + uint32_t tail1 = remains - 4; + __mmask8 w_mask1 = (1UL << tail1) - 1; + for (i = 0; i < m2; i += 2) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = _mm_unpackhi_epi16(a0, a1); + _mm_storeu_si128((void *)boffset0, a00); + _mm_mask_storeu_epi32((void *)boffset1, w_mask1, a01); + boffset0 += 8; + boffset1 += 2 * tail1; + } + for (; i < m; i++) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + _mm_store_sd((void *)boffset0, (__m128d) a0); + _mm_mask_storeu_epi16((void *)boffset1, w_mask1, (__m128i) _mm_permute_pd((__m128d) a0, 0x1)); + boffset0 += 4; + boffset1 += tail1; + } + } else { + for (i = 0; i < m2; i += 2) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + _mm_mask_storeu_epi32((void *)boffset0, r_mask, a00); + boffset0 += 2 * remains; + } + for (; i < m; i++) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + _mm_mask_storeu_epi16((void *)boffset0, r_mask, a0); + } + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemv_n.c b/kernel/x86_64/sbgemv_n.c index 18e64dc3f..08ccace61 100644 --- a/kernel/x86_64/sbgemv_n.c +++ b/kernel/x86_64/sbgemv_n.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined (COOPERLAKE) +#if defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "sbgemv_n_microk_cooperlake.c" #endif diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c index 46e6d0ff9..4711e9720 100644 --- a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c @@ -30,6 +30,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
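The BF16 copy kernels above pair two consecutive rows element by element with _mm512_unpacklo_epi16/_mm512_unpackhi_epi16 and then restore column order across lanes with _mm512_permutex2var_epi32, so the packed buffer ends up holding (row i, row i+1) pairs, presumably so a vdpbf16ps-based GEMM kernel can consume the k dimension two bfloat16 values at a time. A minimal scalar sketch of that net layout, not part of the patch, assuming bfloat16 is carried as raw 16-bit words and ignoring how the vector code splits the 32 columns across two destination blocks; pack_two_rows_ref and its argument names are illustrative only:

#include <stdint.h>

typedef uint16_t bf16_raw;  /* assumption: bfloat16 handled as raw 16-bit words */

/* Interleave rows r0 and r1 (length n) into b as
 * (r0[0], r1[0], r0[1], r1[1], ...), the pair order described above. */
static void pack_two_rows_ref(const bf16_raw *r0, const bf16_raw *r1,
                              bf16_raw *b, int n)
{
    for (int k = 0; k < n; k++) {
        b[2 * k]     = r0[k];
        b[2 * k + 1] = r1[k];
    }
}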
// Include common macros for BF16 based operations with IA intrinsics #include "bf16_common_macros.h" +#undef STORE16_COMPLETE_RESULT +#undef STORE16_MASK_COMPLETE_RESULT +#undef STORE8_COMPLETE_RESULT +#undef STORE8_MASK_COMPLETE_RESULT +#undef STORE4_COMPLETE_RESULT +#undef STORE4_MASK_COMPLETE_RESULT + #ifndef ZERO_BETA // Beta is non-zero #ifndef ONE_BETA // BETA is not ONE @@ -103,7 +110,9 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3; @@ -202,7 +211,7 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31))); __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); - unsigned short store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + unsigned int store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); __mmask32 store_tail_mask = *((__mmask32*) &store_tail_mask_value); accum512_0 = _mm512_setzero_ps(); diff --git a/kernel/x86_64/sbgemv_t.c b/kernel/x86_64/sbgemv_t.c index 22b099116..51ea0d937 100644 --- a/kernel/x86_64/sbgemv_t.c +++ b/kernel/x86_64/sbgemv_t.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined (COOPERLAKE) +#if defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "sbgemv_t_microk_cooperlake.c" #endif diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c index 51e681add..8a3a022fb 100644 --- a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c @@ -29,6 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
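The template hunks above and below make two easy-to-miss changes: the STORE*_COMPLETE_RESULT macros are #undef'd before the template conditionally redefines them, presumably because the template is compiled more than once with different ZERO_BETA/ONE_BETA settings, and the BETAVECTOR broadcast is additionally guarded by #ifndef ONE_BETA, since the beta == 1 variant never multiplies by beta and the broadcast would otherwise be dead code or an unused-variable warning. A minimal sketch of that specialisation pattern, not part of the patch, assuming ZERO_BETA and ONE_BETA select the beta == 0 and beta == 1 builds as the comments in the diff indicate; store_one and its argument names are illustrative only:

static inline void store_one(float *y, float acc, float alpha, float beta)
{
#if defined(ZERO_BETA)
    *y = alpha * acc;               /* beta == 0: the old y is ignored        */
#elif defined(ONE_BETA)
    *y = alpha * acc + *y;          /* beta == 1: no beta broadcast is needed */
#else
    *y = alpha * acc + beta * *y;   /* general beta: the BETAVECTOR analogue  */
#endif
}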
// Include common macros for BF16 based operations with IA intrinsics #include "bf16_common_macros.h" +#undef STORE16_COMPLETE_RESULT +#undef STORE16_MASK_COMPLETE_RESULT +#undef STORE8_COMPLETE_RESULT +#undef STORE8_MASK_COMPLETE_RESULT +#undef STORE4_COMPLETE_RESULT +#undef STORE4_MASK_COMPLETE_RESULT + #ifndef ZERO_BETA // Beta is non-zero #ifndef ONE_BETA // BETA is not ONE @@ -231,7 +238,9 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif unsigned char load_mask_value = (((unsigned char)0xff) >> 6); @@ -280,7 +289,7 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, } else if (tail_num == 8) { __m256 result256 = _mm256_setzero_ps(); - __m256i matrixArray256 = _mm256_loadu_si256(&a[(tag_m_32x)*2]); // Load 8 rows with n=2 + __m256i matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*2]); // Load 8 rows with n=2 __m256i xArray256 = _mm512_castsi512_si256(xArray); result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256); @@ -323,7 +332,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); @@ -395,9 +406,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, result256_0 = _mm256_setzero_ps(); result256_1 = _mm256_setzero_ps(); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element - matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element - matrixArray256_2 = _mm256_loadu_si256(&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row @@ -423,8 +434,8 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, if (tail_num > 10) { unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-10-1)*3+1))); __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element - matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element matrixArray256_2 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+10)*3 + 2)]); // Load m-tag_m_32x-10 rows matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the 
first 2 elements for each row @@ -439,7 +450,7 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, } else if (tail_num > 5) { unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-5-1)*3+2))); __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element matrixArray256_1 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+5)*3+1)]); // Load m-tag_m_32x-5 rows matrixArray256_2 = _mm256_setzero_si256(); @@ -499,7 +510,9 @@ static int sbgemv_kernel_16x4(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_1 = _mm512_set1_epi32(1); @@ -591,7 +604,9 @@ static int sbgemv_kernel_30x5(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512 result_0, result_1; @@ -782,7 +797,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_1 = _mm512_set1_epi32(1); @@ -866,9 +883,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, result256_0 = _mm256_setzero_ps(); - matrixArray_0 = _mm256_loadu_si256(&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element - matrixArray_1 = _mm256_loadu_si256(&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element - matrixArray_2 = _mm256_loadu_si256(&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element + matrixArray_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element + matrixArray_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element + matrixArray_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element // Process the 0|1 elements // Select the 0|1 elements for each row @@ -957,7 +974,9 @@ static int sbgemv_kernel_16x7(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_2 = _mm512_set1_epi32(2); @@ -1110,7 +1129,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, { BLASLONG tag_m_16x = m & (~15); - __m128i x128 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| if (tag_m_16x > 0) { __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; @@ -1122,7 +1141,9 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_2 = _mm512_set1_epi32(2); @@ -1214,7 +1235,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m128 result128, tmp128; for (BLASLONG i = tag_m_16x; i < m; i++) { result128 = _mm_setzero_ps(); - matrixArray128 = 
_mm_loadu_si128(&a[(i)*8]); // Load 1 rows with n=8 + matrixArray128 = _mm_loadu_si128((__m128i *)&a[(i)*8]); // Load 1 rows with n=8 result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); tmp128 = _mm_shuffle_ps(result128, result128, 14); result128 = _mm_add_ps(result128, tmp128); @@ -1258,7 +1279,7 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, unsigned char x_load_mask_value = (((unsigned char)0xff) >> 7); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|0 |0 | 0| 0| 0| 0| 0| if (tag_m_14x > 0) { @@ -1271,7 +1292,9 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m256i M256_EPI16_2 = _mm256_set1_epi16(2); @@ -1390,7 +1413,7 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = (((unsigned char)0xf) >> 3); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi32(x_load_mask, (x+8)); // |x8|x9|0 | 0| 0| 0| 0| 0| if (tag_m_12x > 0) { @@ -1403,7 +1426,9 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m256i M256_EPI32_1 = _mm256_set1_epi32(1); @@ -1522,7 +1547,7 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1| x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10| 0| 0| 0| 0| 0| if (tag_m_15x > 0) { @@ -1535,7 +1560,9 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; @@ -1690,7 +1717,7 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = (((unsigned char)0xff) >> 4); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2| x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1| x2| x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10|x11| 0| 0| 0| 0| if (tag_m_15x > 0) { @@ -1703,7 +1730,9 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, 
idx_stage1_base_5; @@ -1873,16 +1902,15 @@ static int sbgemv_kernel_16x13(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); - unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 6); - __mmask32 load_mask = *((__mmask32*) &load_mask_value); - // Prepare X with 2-step interleave way xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); BF16_INTERLEAVE_1x32(xArray) @@ -2045,7 +2073,9 @@ static int sbgemv_kernel_16x14(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); @@ -2207,16 +2237,15 @@ static int sbgemv_kernel_16x15(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); - unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2); - __mmask32 load_mask = *((__mmask32*) &load_mask_value); - // Prepare X with 2-step interleave way xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); BF16_INTERLEAVE_1x32(xArray) @@ -2364,7 +2393,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x { BLASLONG tag_m_16x = m & (~15); - __m256i x256 = _mm256_loadu_si256(x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| + __m256i x256 = _mm256_loadu_si256((__m256i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| if (tag_m_16x > 0) { __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ @@ -2377,7 +2406,9 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); @@ -2484,7 +2515,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m128 accum128, tmp128; for (BLASLONG i = tag_m_16x; i < m; i++) { accum256 = _mm256_setzero_ps(); - matrixArray256 = _mm256_loadu_si256(&a[(i)*16]); // Load 1 rows with n=16 + matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(i)*16]); // Load 1 rows with n=16 accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); @@ -2535,7 +2566,9 @@ static int sbgemv_kernel_8x16p_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ @@ -2647,8 +2680,6 @@ static int 
sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b BLASLONG tag_n_32x = n & (~31); BLASLONG tag_n_128x = n & (~127); - __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ - accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; __m512 accum512_bridge[8]; __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3; __m256 accum256_0; @@ -2658,7 +2689,9 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; @@ -2825,7 +2858,9 @@ static int sbgemv_kernel_8x32_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; @@ -2961,7 +2996,9 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m512 ALPHAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(alpha)); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(beta)); +#endif #endif __m256 accum256_0, accum256_1, accum256_2, accum256_3, accum256_4, accum256_5, accum256_6, accum256_7, \ @@ -3012,7 +3049,7 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m128 accum128, tmp128; for (BLASLONG i = tag_m_8x; i < m; i++) { accum256_0 = _mm256_setzero_ps(); - matrixArray_0 = _mm256_loadu_si256(&a[(i)*lda]); // Load 1 rows with n=16 + matrixArray_0 = _mm256_loadu_si256((__m256i *)&a[(i)*lda]); // Load 1 rows with n=16 accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256); accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index e816c67e9..a0acea9d1 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
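Most of the sbgemv template changes above are mechanical: _mm256_loadu_si256 and _mm_loadu_si128 take __m256i const * / __m128i const * arguments, so passing a bfloat16 * straight through is a pointer-type mismatch that stricter compilers warn about or reject, and the hunks add the explicit casts (the unused load_mask temporaries are dropped at the same time). A minimal sketch of the cast pattern, not part of the patch, assuming bf16 is carried as a 16-bit integer type and an AVX-capable compile (these kernels are already built with AVX-512 flags); load_16_bf16 and bf16_raw are illustrative names only:

#include <immintrin.h>
#include <stdint.h>

typedef uint16_t bf16_raw;   /* assumption: bfloat16 carried as raw 16-bit words */

/* Load 16 bf16 values; the explicit (__m256i *) cast mirrors the patch. */
static inline __m256i load_16_bf16(bf16_raw *a)
{
    return _mm256_loadu_si256((__m256i *)a);
}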
#include "sdot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "sdot_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "sdot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "sdot_microk_sandy-2.c" diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 1c29c1168..6217acf48 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -41,7 +41,7 @@ #include int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, - FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, + IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c, BLASLONG ldc){ BLASLONG i, j; diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c index aaadcf151..badeb0fbf 100644 --- a/kernel/x86_64/sgemm_direct_skylakex.c +++ b/kernel/x86_64/sgemm_direct_skylakex.c @@ -1,8 +1,11 @@ /* the direct sgemm code written by Arjan van der Ven */ -#include #include "common.h" -#if defined(SKYLAKEX) || defined (COOPERLAKE) +#if defined(SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) + +#include + + /* * "Direct sgemm" code. This code operates directly on the inputs and outputs * of the sgemm call, avoiding the copies, memory realignments and threading, @@ -469,7 +472,7 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG s } } #else -#include "common.h" + void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) {} #endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c index f3d614242..2db8b2fea 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c @@ -501,7 +501,11 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f int32_t permil[16] = {0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3}; BLASLONG n_count = n; float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; +#if defined(__clang__) + for(;n_count>23;n_count-=24) COMPUTE(24) +#else for(;n_count>23;n_count-=24) COMPUTE_n24 +#endif for(;n_count>19;n_count-=20) COMPUTE(20) for(;n_count>15;n_count-=16) COMPUTE(16) for(;n_count>11;n_count-=12) COMPUTE(12) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c new file mode 100644 index 000000000..cea63172b --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -0,0 +1,617 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() +#define LOAD_A_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[lda * k + i + (M*16)]) +#define MASK_LOAD_A_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[lda * k + i + (M*16)]) +#define BROADCAST_LOAD_B_512(M, N) __m512 Bval##N = _mm512_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) +#if defined(B0) +#define STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#else +#define STORE_512(M, N) \ + result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + asm("vfmadd231ps (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512)); \ + _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) +#define MASK_STORE_512(M, N) \ + result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#endif + +#define LOAD_KA_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&mbuf[(mi + M)*K + k]); +#define LOAD_KB_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[(j + N)*ldb + k]) +#define MASK_LOAD_KA_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &mbuf[(mi + M)*K + k]) +#define MASK_LOAD_KB_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[(j + N)*ldb + k]) +#define REDUCE_4(rr0, rr1, rr2, rr3) \ + __m512 r0, r1, r2, r3, t0, t1, t2, t3;\ + r0 = _mm512_unpacklo_ps(rr0, rr1); r1 = _mm512_unpackhi_ps(rr0, rr1); \ + r2 = _mm512_unpacklo_ps(rr2, rr3); r3 = _mm512_unpackhi_ps(rr2, rr3); \ + t0 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0)); t1 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2)); \ + t2 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0)); t3 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2)); \ + r0 = _mm512_add_ps(t0, t1); r1 = _mm512_add_ps(t2, t3); t0 = _mm512_add_ps(r0, r1); \ + __m128 s0, s1, s2, s3; \ + s0 = _mm512_extractf32x4_ps(t0, 0); s1 = _mm512_extractf32x4_ps(t0, 1); s2 = _mm512_extractf32x4_ps(t0, 2); s3 = _mm512_extractf32x4_ps(t0, 3); \ + s0 = _mm_maskz_add_ps(mask8, s0, s1); s2 = _mm_maskz_add_ps(mask8, s2, s3); s0 = _mm_maskz_add_ps(mask8, s0, 
s2); \ + s0 = _mm_maskz_mul_ps(mask8, alpha_128, s0); +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) +#if defined(B0) +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N); +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); \ +} +#else +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M]; +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_128)); \ + _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + s1 = _mm_i32gather_ps(&C[j*ldc + i + M], vindex_n, 4); \ + s0 = _mm_fmadd_ps(s1, beta_128, s0); \ + _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); \ +} +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m64 = M & ~63; + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n6 = N - (N % 6); + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); +#endif + + for (i = 0; i < m64; i += 64) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); 
MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m32; i += 32) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + 
STORE_512(0, 0); + } + } + int mm = M - i; + if (!mm) return 0; + if (mm > 8 || K < 32) { + register __mmask16 mask asm("k1") = (1UL << mm) - 1; + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + } else { + /* M => [1, 8] + * + * This kernel use dot-like style to calc a value - C(x, y): + * C(x, y) = A(x, 0)*B(0, y) + A(x, 1)*B(1, y) +....+ A(x, K)*B(K, y) + * + * Alloc a buf to copy rest of A as row major, + * so memory access from 0 to K is continuous for both A & B. + * + * Loading to zmm and FMA 16 of k at one loop, + * finally reduce_add zmm to a single float result in C(x, y). + * + * Note: performance is bad when K is small. + */ + FLOAT *mbuf = (FLOAT *) malloc(sizeof(FLOAT)*mm*K); + __mmask8 mask8 = (1UL << mm) - 1; + __mmask16 mask; + BLASLONG k16 = K & ~15; + BLASLONG k8 = K & ~7; + for (k = 0; k < k8; k += 8) { + __m256 r0, r1, r2, r3, r4, r5, r6, r7; + __m256 t0, t1, t2, t3, t4, t5, t6, t7; + r0 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(0 + k)]); + r1 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(1 + k)]); + r2 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(2 + k)]); + r3 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(3 + k)]); + r4 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(4 + k)]); + r5 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(5 + k)]); + r6 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(6 + k)]); + r7 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(7 + k)]); + + t0 = _mm256_unpacklo_ps(r0, r1); + t1 = _mm256_unpackhi_ps(r0, r1); + t2 = _mm256_unpacklo_ps(r2, r3); + t3 = _mm256_unpackhi_ps(r2, r3); + t4 = _mm256_unpacklo_ps(r4, r5); + t5 = _mm256_unpackhi_ps(r4, r5); + t6 = _mm256_unpacklo_ps(r6, r7); + t7 = _mm256_unpackhi_ps(r6, r7); + + r0 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(1,0,1,0)); + r1 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(3,2,3,2)); + r2 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0)); + r3 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2)); + r4 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(1,0,1,0)); + r5 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(3,2,3,2)); + r6 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0)); + r7 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2)); + + t0 = _mm256_permute2f128_ps(r0, r4, 0x20); + t1 = _mm256_permute2f128_ps(r1, r5, 0x20); + t2 = _mm256_permute2f128_ps(r2, r6, 0x20); + t3 = _mm256_permute2f128_ps(r3, r7, 0x20); + t4 = _mm256_permute2f128_ps(r0, r4, 0x31); + t5 = _mm256_permute2f128_ps(r1, r5, 0x31); + t6 = _mm256_permute2f128_ps(r2, r6, 0x31); + t7 = 
_mm256_permute2f128_ps(r3, r7, 0x31); + + switch (mm) { + case 8: _mm256_storeu_ps(&mbuf[k + 7*K], t7); + case 7: _mm256_storeu_ps(&mbuf[k + 6*K], t6); + case 6: _mm256_storeu_ps(&mbuf[k + 5*K], t5); + case 5: _mm256_storeu_ps(&mbuf[k + 4*K], t4); + case 4: _mm256_storeu_ps(&mbuf[k + 3*K], t3); + case 3: _mm256_storeu_ps(&mbuf[k + 2*K], t2); + case 2: _mm256_storeu_ps(&mbuf[k + 1*K], t1); + case 1: _mm256_storeu_ps(&mbuf[k + 0*K], t0); + } + } + for (; k < K; k++) { + for (int ii = 0; ii < mm; ii++) { + mbuf[k + ii*K] = A[i + lda*k + ii]; + } + } + int mi = 0; + mask8 = 0xff; // just use to avoid SSE instruction + __m128 alpha_128 = _mm_broadcast_ss(&alpha); +#if !defined(B0) + __m128 beta_128 = _mm_broadcast_ss(&beta); +#endif + __m128i vindex_n = _mm_set_epi32(ldc*3, ldc*2, ldc, 0); + for (; i < m4; i += 4, mi += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k16; 
k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE_M4(0); + } + + } + for (; i < m2; i += 2, mi += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1, mi += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE_N4(0); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); 
+ MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + free(mbuf); + } + return 0; +} +#else +#include "../generic/gemm_small_matrix_kernel_nn.c" +#endif + diff --git a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c new file mode 100644 index 000000000..a7d87f8c4 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c @@ -0,0 +1,535 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() +#define LOAD_A_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[lda * k + i + (M*16)]) +#define MASK_LOAD_A_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[lda * k + i + (M*16)]) +#define BROADCAST_LOAD_B_512(M, N) __m512 Bval##N = _mm512_broadcastss_ps(_mm_load_ss(&B[ldb * k + j + N])) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) + +#define BROADCAST_LOAD_A_512(M, N) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[lda * k + i + M])) +#define LOAD_B_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[ldb * k + j + (N*16)]) +#define MASK_LOAD_B_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[ldb * k + j + (N*16)]) +#if defined(B0) +#define STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4) +#else +#define STORE_512(M, N) \ + result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + asm("vfmadd231ps (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512)); \ + _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) +#define MASK_STORE_512(M, N) \ + result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_i32gather_ps(vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask, vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m64 = M & ~63; + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n64 = N & ~63; + BLASLONG n32 = N & ~31; + BLASLONG n8 = N & ~7; + BLASLONG n6 = N - (N % 6); + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m512 
beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); +#endif + + for (i = 0; i < m64; i += 64) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); DECLARE_RESULT_512(2, 4); DECLARE_RESULT_512(3, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); DECLARE_RESULT_512(2, 5); DECLARE_RESULT_512(3, 5); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + MATMUL_512(0, 4); MATMUL_512(1, 4); MATMUL_512(2, 4); MATMUL_512(3, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); MATMUL_512(2, 5); MATMUL_512(3, 5); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + STORE_512(0, 4); STORE_512(1, 4); STORE_512(2, 4); STORE_512(3, 4); + STORE_512(0, 5); STORE_512(1, 5); STORE_512(2, 5); STORE_512(3, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m32; i += 32) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + DECLARE_RESULT_512(0, 6); DECLARE_RESULT_512(1, 6); + DECLARE_RESULT_512(0, 7); DECLARE_RESULT_512(1, 7); + for (k = 0; k < K; k++) { + 
LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + MATMUL_512(0, 6); MATMUL_512(1, 6); + MATMUL_512(0, 7); MATMUL_512(1, 7); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + STORE_512(0, 6); STORE_512(1, 6); + STORE_512(0, 7); STORE_512(1, 7); + } + for (;j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + STORE_512(0, 6); + STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + 
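The block above keeps a 64x6 tile of C live in 24 zmm accumulators: each k step performs four 16-float loads of A, six scalar broadcasts of B, and 24 FMAs. A scalar sketch of one such k step, mirroring the LOAD_A_512 / BROADCAST_LOAD_B_512 / MATMUL_512 indexing (tile_update_scalar is a hypothetical helper, not part of the patch):

static void tile_update_scalar(const float *A, long lda, const float *B, long ldb,
                               long i, long j, long k, float acc[6][64])
{
    for (int n = 0; n < 6; n++) {                  /* one broadcast B value per column */
        float b = B[ldb * k + j + n];
        for (int m = 0; m < 64; m++)               /* four zmm registers' worth of A   */
            acc[n][m] += A[lda * k + i + m] * b;   /* the FMA done by MATMUL_512       */
    }
}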
STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + int mm = M - i; + if (mm >= 12) { + register __mmask16 mask asm("k1") = (1UL << mm) - 1; + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); + MASK_STORE_512(0, 6); + MASK_STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + } else if (mm > 0) { + int index_n[16]; + for (int ii = 0; ii < 16; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_si512(index_n); + for (; i < m4; i += 4) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); 
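In the beta path, STORE_512 and MASK_STORE_512 above fold beta*C into the accumulator with an inline-asm vfmadd231ps that reads C straight from memory. A plain-intrinsics sketch of the same store (store_with_beta is a hypothetical helper; assumes AVX-512F and <immintrin.h>):

#include <immintrin.h>
static void store_with_beta(float *cptr, __m512 result, __m512 beta_512)
{
    /* result already holds alpha * accumulated product, as in the macro */
    result = _mm512_fmadd_ps(_mm512_loadu_ps(cptr), beta_512, result);  /* result += beta*C */
    _mm512_storeu_ps(cptr, result);
}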
MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); SCATTER_STORE_512(2, 2); SCATTER_STORE_512(3, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); SCATTER_STORE_512(2, 3); SCATTER_STORE_512(3, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + } + __mmask16 mask = 0xffff; + for (; j < N; j += 16) { + int remains = N - j; + if (remains < 16) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); MASK_SCATTER_STORE_512(2, 0); MASK_SCATTER_STORE_512(3, 0); + } + } + for (; i < m2; i += 2) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask16 mask = 0xffff; + for (; j < N; j += 16) { + int remains = N - j; + if (remains < 16) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); 
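When fewer than 12 rows are left, the code above switches to the scatter layout: vindex_n holds {0, ldc, 2*ldc, ...}, so the 16 lanes written by one SCATTER_STORE_512 land in a single row of the column-major C, one per column, ldc apart. A scalar sketch of the beta variant of that store (scatter_store_scalar is a hypothetical helper; acc[] stands for the 16 lanes of result##M##N):

static void scatter_store_scalar(float *C, long ldc, long i, long j, long M, long N,
                                 const float acc[16], float alpha, float beta)
{
    for (int nn = 0; nn < 16; nn++)          /* one column of C per lane */
        C[(j + N * 16 + nn) * ldc + i + M] =
            alpha * acc[nn] + beta * C[(j + N * 16 + nn) * ldc + i + M];
}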
MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask16 mask = 0xffff; + for (; j < N; j += 16) { + int remains = N - j; + if (remains < 16) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } + } + return 0; +} diff --git a/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c b/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c new file mode 100644 index 000000000..cbf2374bd --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c @@ -0,0 +1,53 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + if (MNK > 100.0*100.0*100.0) // disable for big size matrix + return 0; + // tuning for A transpose + if (transa) { + if (transb) { + /* TT kernel perform not good when: + * 1. K is too small. + */ + if (K < 4) return 0; + } else { + /* TN kernel perform not good when: + * 1. C matrix is too big + * 2. 
K is too small + */ + if (M * N > 1200 || K < 32) + return 0; + } + } + + return 1; +} diff --git a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c new file mode 100644 index 000000000..308f5e35e --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c @@ -0,0 +1,321 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) + +#define LOAD_KA_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[(i + M)*lda + k]); +#define LOAD_KB_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[(j + N)*ldb + k]) +#define MASK_LOAD_KA_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[(i + M)*lda + k]) +#define MASK_LOAD_KB_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[(j + N)*ldb + k]) + +#define REDUCE_4(rr0, rr1, rr2, rr3) \ + __m512 r0, r1, r2, r3, t0, t1, t2, t3;\ + r0 = _mm512_unpacklo_ps(rr0, rr1); r1 = _mm512_unpackhi_ps(rr0, rr1); \ + r2 = _mm512_unpacklo_ps(rr2, rr3); r3 = _mm512_unpackhi_ps(rr2, rr3); \ + t0 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0)); t1 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2)); \ + t2 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0)); t3 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2)); \ + r0 = _mm512_add_ps(t0, t1); r1 = _mm512_add_ps(t2, t3); t0 = _mm512_add_ps(r0, r1); \ + __m128 s0, s1, s2, s3; \ + s0 = _mm512_extractf32x4_ps(t0, 0); s1 = _mm512_extractf32x4_ps(t0, 1); s2 = _mm512_extractf32x4_ps(t0, 2); s3 = _mm512_extractf32x4_ps(t0, 3); \ + s0 = _mm_maskz_add_ps(mask8, s0, s1); s2 = _mm_maskz_add_ps(mask8, s2, s3); s0 = _mm_maskz_add_ps(mask8, s0, s2); \ + s0 = _mm_maskz_mul_ps(mask8, alpha_128, s0); + +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) + +#if defined(B0) +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) +#define STORE_M4(N, s0) _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); +#define STORE_N4(M, s0) _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); +#else +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M] +#define STORE_M4(N, s0) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_128)); \ + _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); + +#define STORE_N4(M, s0) \ + s0 = _mm_fmadd_ps(_mm_i32gather_ps(&C[j*ldc + i + M], vindex_n, 4), beta_128, s0); \ + _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); +#endif +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + STORE_M4(N, s0) \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + STORE_N4(M, s0) \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k16 = K & ~15; + + __mmask16 mask; + __mmask8 mask8 = 0xff; // just use to avoid SSE instruction + + __m128i vindex_n = _mm_set_epi32(ldc*3, ldc*2, ldc, 0); + __m128 alpha_128 = _mm_broadcast_ss(&alpha); +#if !defined(B0) + __m128 beta_128 = _mm_broadcast_ss(&beta); +#endif + for (i = 0; i < m4; i += 4) { + for (j = 0; j < n4; j += 4) { + 
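In this TN kernel the vectors run along K: LOAD_KA_512 pulls 16 consecutive K-elements A[(i+M)*lda + k .. k+15] and LOAD_KB_512 the matching slice of B, so each result register accumulates partial products of a single dot product that REDUCE_4 / STORE_REDUCE collapse at the end. A scalar sketch of what one STORE_REDUCE amounts to (store_reduce_scalar is a hypothetical helper, not part of the patch):

static void store_reduce_scalar(const float *A, long lda, const float *B, long ldb,
                                float *C, long ldc, long i, long j, long M, long N,
                                long K, float alpha, float beta)
{
    float sum = 0.0f;
    for (long k = 0; k < K; k++)
        sum += A[(i + M) * lda + k] * B[(j + N) * ldb + k];   /* dot product along K */
    C[(j + N) * ldc + i + M] = alpha * sum + beta * C[(j + N) * ldc + i + M];
}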
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE_M4(0); + } + + } + for (; i < m2; i += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); 
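K is consumed 16 elements at a time (k16 = K & ~15), and the leftover elements are handled in one masked pass using (1UL << remains) - 1, so no scalar cleanup loop is needed. A self-contained sketch of that pattern for a single dot product (dot_avx512 is a hypothetical helper; assumes AVX-512F and <immintrin.h>):

#include <immintrin.h>
static float dot_avx512(const float *a, const float *b, long K)
{
    __m512 acc = _mm512_setzero_ps();
    long k, k16 = K & ~15L;
    for (k = 0; k < k16; k += 16)                      /* full 16-wide steps */
        acc = _mm512_fmadd_ps(_mm512_loadu_ps(&a[k]), _mm512_loadu_ps(&b[k]), acc);
    if (k < K) {                                       /* 1..15 leftovers    */
        __mmask16 mask = (__mmask16)((1UL << (K - k)) - 1);
        acc = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(mask, &a[k]),
                              _mm512_maskz_loadu_ps(mask, &b[k]), acc);
    }
    return _mm512_reduce_add_ps(acc);                  /* horizontal sum, as in STORE_REDUCE */
}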
LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE_N4(0); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + return 0; +} +#else +#include "../generic/gemm_small_matrix_kernel_tn.c" +#endif + diff --git a/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c new file mode 100644 index 000000000..023f58746 --- /dev/null +++ 
b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c @@ -0,0 +1,414 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" +#include + +#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() +#define BROADCAST_LOAD_A_512(M, N) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k + lda * (i+M)])) +#define LOAD_B_512(M,N) __m512 Bval##N = _mm512_loadu_ps(&B[ldb * k + j + (N*16)]) +#define MASK_LOAD_B_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[ldb * k + j + (N*16)]) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) + +#if defined(B0) +#define STORE_8xy(v, N, x, y) _mm256_storeu_ps(&C[(j + N*16 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) _mm_mask_storeu_ps(&C[(j + N*16 + x + y*4)*ldc + i], mask8, v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); +#else +#define STORE_8xy(v, N, x, y) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(v): "r"(&C[(j + N*16 + x + y*8)*ldc + i]), "v"(beta_256)); \ + _mm256_storeu_ps(&C[(j + N*16 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(v): "r"(&C[(j + N*16 + x + y*4)*ldc + i]), "v"(beta_128)); \ + _mm_mask_storeu_ps(&C[(j + N*16 + x + y*4)*ldc + i], mask8, v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_i32gather_ps(vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) 
result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask, vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); +#endif + +#define REORDER_8x16(r0, r1, r2, r3, r4, r5, r6, r7) \ + __m512 t0, t1, t2, t3, t4, t5, t6, t7, v; \ + t0 = _mm512_unpacklo_ps(r0, r1); \ + t1 = _mm512_unpackhi_ps(r0, r1); \ + t2 = _mm512_unpacklo_ps(r2, r3); \ + t3 = _mm512_unpackhi_ps(r2, r3); \ + t4 = _mm512_unpacklo_ps(r4, r5); \ + t5 = _mm512_unpackhi_ps(r4, r5); \ + t6 = _mm512_unpacklo_ps(r6, r7); \ + t7 = _mm512_unpackhi_ps(r6, r7); \ + v = _mm512_shuffle_ps(t0, t2, 0x4E); \ + r0 = _mm512_mask_blend_ps(kc, t0, v); \ + r1 = _mm512_mask_blend_ps(k3, t2, v); \ + v = _mm512_shuffle_ps(t1, t3, 0x4E); \ + r2 = _mm512_mask_blend_ps(kc, t1, v); \ + r3 = _mm512_mask_blend_ps(k3, t3, v); \ + v = _mm512_shuffle_ps(t4, t6, 0x4E); \ + r4 = _mm512_mask_blend_ps(kc, t4, v); \ + r5 = _mm512_mask_blend_ps(k3, t6, v); \ + v = _mm512_shuffle_ps(t5, t7, 0x4E); \ + r6 = _mm512_mask_blend_ps(kc, t5, v); \ + r7 = _mm512_mask_blend_ps(k3, t7, v); \ + t0 = _mm512_permutex2var_ps(r0, idx_lo, r4); \ + t1 = _mm512_permutex2var_ps(r1, idx_lo, r5); \ + t2 = _mm512_permutex2var_ps(r2, idx_lo, r6); \ + t3 = _mm512_permutex2var_ps(r3, idx_lo, r7); \ + t4 = _mm512_permutex2var_ps(r0, idx_hi, r4); \ + t5 = _mm512_permutex2var_ps(r1, idx_hi, r5); \ + t6 = _mm512_permutex2var_ps(r2, idx_hi, r6); \ + t7 = _mm512_permutex2var_ps(r3, idx_hi, r7); \ + t0 = _mm512_mul_ps(t0, alpha_512); \ + t1 = _mm512_mul_ps(t1, alpha_512); \ + t2 = _mm512_mul_ps(t2, alpha_512); \ + t3 = _mm512_mul_ps(t3, alpha_512); \ + t4 = _mm512_mul_ps(t4, alpha_512); \ + t5 = _mm512_mul_ps(t5, alpha_512); \ + t6 = _mm512_mul_ps(t6, alpha_512); \ + t7 = _mm512_mul_ps(t7, alpha_512); + +#define SAVE_8(N, x, y) {\ + __m256 v8 = _mm512_extractf32x8_ps(t##x, y); \ + STORE_8xy(v8, N, x, y); \ +} + +#define REORDER_STORE_8x16(N) {\ + REORDER_8x16(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + SAVE_8(N, 0, 0); SAVE_8(N, 1, 0); SAVE_8(N, 2, 0); SAVE_8(N, 3, 0); SAVE_8(N, 4, 0); SAVE_8(N, 5, 0); SAVE_8(N, 6, 0); SAVE_8(N, 7, 0); \ + SAVE_8(N, 0, 1); SAVE_8(N, 1, 1); SAVE_8(N, 2, 1); SAVE_8(N, 3, 1); SAVE_8(N, 4, 1); SAVE_8(N, 5, 1); SAVE_8(N, 6, 1); SAVE_8(N, 7, 1); \ +} + +#define MASK_SAVE_8() \ + switch (nn) { \ + case 16: SAVE_8(0, 7, 1); \ + case 15: SAVE_8(0, 6, 1); \ + case 14: SAVE_8(0, 5, 1); \ + case 13: SAVE_8(0, 4, 1); \ + case 12: SAVE_8(0, 3, 1); \ + case 11: SAVE_8(0, 2, 1); \ + case 10: SAVE_8(0, 1, 1); \ + case 9: SAVE_8(0, 0, 1); \ + case 8: SAVE_8(0, 7, 0); \ + case 7: SAVE_8(0, 6, 0); \ + case 6: SAVE_8(0, 5, 0); \ + case 5: SAVE_8(0, 4, 0); \ + case 4: SAVE_8(0, 3, 0); \ + case 3: SAVE_8(0, 2, 0); \ + case 2: SAVE_8(0, 1, 0); \ + case 1: SAVE_8(0, 0, 0); \ + } + +#define MASK_REORDER_STORE_8x16(N) {\ + REORDER_8x16(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + MASK_SAVE_8(); \ +} + +#define REORDER_4x16(r0, r1, r2, r3) \ + __m512 t0, t1, t2, t3, v; \ + t0 = _mm512_unpacklo_ps(r0, r1); \ + t1 = _mm512_unpackhi_ps(r0, r1); \ + t2 = _mm512_unpacklo_ps(r2, r3); \ + t3 = _mm512_unpackhi_ps(r2, r3); \ + v = _mm512_shuffle_ps(t0, t2, 0x4E); \ + r0 = _mm512_mask_blend_ps(kc, t0, v); \ + r1 = _mm512_mask_blend_ps(k3, t2, v); \ + v = 
_mm512_shuffle_ps(t1, t3, 0x4E); \ + r2 = _mm512_mask_blend_ps(kc, t1, v); \ + r3 = _mm512_mask_blend_ps(k3, t3, v); \ + t0 = _mm512_mul_ps(r0, alpha_512); \ + t1 = _mm512_mul_ps(r1, alpha_512); \ + t2 = _mm512_mul_ps(r2, alpha_512); \ + t3 = _mm512_mul_ps(r3, alpha_512); + +#define SAVE_4(N, x, y) {\ + __m128 v4 = _mm512_extractf32x4_ps(t##x, y); \ + STORE_4xy(v4, N, x, y); \ +} + +#define REORDER_STORE_4x16(N) {\ + REORDER_4x16(result0##N, result1##N, result2##N, result3##N); \ + SAVE_4(N, 0, 0); SAVE_4(N, 1, 0); SAVE_4(N, 2, 0); SAVE_4(N, 3, 0); \ + SAVE_4(N, 0, 1); SAVE_4(N, 1, 1); SAVE_4(N, 2, 1); SAVE_4(N, 3, 1); \ + SAVE_4(N, 0, 2); SAVE_4(N, 1, 2); SAVE_4(N, 2, 2); SAVE_4(N, 3, 2); \ + SAVE_4(N, 0, 3); SAVE_4(N, 1, 3); SAVE_4(N, 2, 3); SAVE_4(N, 3, 3); \ +} + +#define MASK_SAVE_4() \ + switch (nn) { \ + case 16: SAVE_4(0, 3, 3); \ + case 15: SAVE_4(0, 2, 3); \ + case 14: SAVE_4(0, 1, 3); \ + case 13: SAVE_4(0, 0, 3); \ + case 12: SAVE_4(0, 3, 2); \ + case 11: SAVE_4(0, 2, 2); \ + case 10: SAVE_4(0, 1, 2); \ + case 9: SAVE_4(0, 0, 2); \ + case 8: SAVE_4(0, 3, 1); \ + case 7: SAVE_4(0, 2, 1); \ + case 6: SAVE_4(0, 1, 1); \ + case 5: SAVE_4(0, 0, 1); \ + case 4: SAVE_4(0, 3, 0); \ + case 3: SAVE_4(0, 2, 0); \ + case 2: SAVE_4(0, 1, 0); \ + case 1: SAVE_4(0, 0, 0); \ + } + +#define MASK_REORDER_STORE_4x16(N) {\ + REORDER_4x16(result0##N, result1##N, result2##N, result3##N); \ + MASK_SAVE_4(); \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n64 = N & ~63; + BLASLONG n32 = N & ~31; + + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m256 beta_256 = _mm256_broadcastss_ps(_mm_load_ss(&beta)); + __m128 beta_128 = _mm_broadcastss_ps(_mm_load_ss(&beta)); +#endif + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + }; + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); + __mmask16 kc = 0xcccc; + __mmask16 k3 = 0x3333; + __mmask8 mask8 = 0xff; // force use AVX128 instead of SSE + + for (i = 0; i < m8; i += 8) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(4, 1); DECLARE_RESULT_512(5, 1); DECLARE_RESULT_512(6, 1); DECLARE_RESULT_512(7, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); 
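Each result register in this TT kernel holds 16 consecutive columns of one C row, i.e. elements that are ldc apart in memory. REORDER_8x16 / REORDER_4x16 therefore transpose the tile in registers (unpack/shuffle/blend, plus permutex2var with permute_table in the 8x16 case) so that the extracted __m256 / __m128 pieces become contiguous column segments of the column-major C. A scalar sketch of what REORDER_STORE_8x16 achieves for the N = 0 tile in the beta == 0 build (store_8x16_scalar is a hypothetical helper; acc[m][nn] stands for lane nn of result##m##0):

static void store_8x16_scalar(float *C, long ldc, long i, long j,
                              const float acc[8][16], float alpha)
{
    for (int nn = 0; nn < 16; nn++)          /* one tile column at a time          */
        for (int m = 0; m < 8; m++)          /* eight consecutive rows, contiguous */
            C[(j + nn) * ldc + i + m] = alpha * acc[m][nn];
}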
MATMUL_512(3, 1); + MATMUL_512(4, 1); MATMUL_512(5, 1); MATMUL_512(6, 1); MATMUL_512(7, 1); + } + REORDER_STORE_8x16(0); + REORDER_STORE_8x16(1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + } + MASK_REORDER_STORE_8x16(0); + } + } + for (; i < m4; i += 4) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + REORDER_STORE_4x16(0); + REORDER_STORE_4x16(1); + REORDER_STORE_4x16(2); + REORDER_STORE_4x16(3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + REORDER_STORE_4x16(0); + REORDER_STORE_4x16(1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_REORDER_STORE_4x16(0); + } + } + if (i < M) { + int index_n[16]; + for (int ii = 0; ii < 16; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_si512(index_n); +#if !defined(B0) + __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); +#endif + for (; i < m2; i += 2) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); 
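For a column tail of nn < 16, the MASK_SAVE_8 and MASK_SAVE_4 switches rely on deliberate fall-through: the case for nn stores the highest surviving column segment and then falls into every case below it, so exactly nn segments are written and the rest are skipped. The same pattern in isolation (save_tail and store_col are hypothetical stand-ins for one SAVE_8 / SAVE_4 expansion):

static void save_tail(int nn, void (*store_col)(int col))
{
    switch (nn) {               /* no break on purpose: each case falls through */
    case 4: store_col(3);       /* highest remaining column first               */
    case 3: store_col(2);
    case 2: store_col(1);
    case 1: store_col(0);
    }
}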
DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } + } + return 0; +} diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 3eec21774..621ddc622 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -35,8 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
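The sgemv_n_4.c hunk below routes small column-major GEMV calls (m <= 16384, n <= 48, n != 4) to the new AVX-512 kernels and, when inc_x != 1, first packs the strided x into the contiguous work buffer (ybuffer_align suggests y gets the same treatment). A sketch of that packing step (pack_vector is a hypothetical helper; the patch inlines the loop in CNAME using the caller-provided buffer):

static float *pack_vector(const float *v, long n, long inc, float *buffer)
{
    if (inc == 1)
        return (float *)v;             /* already contiguous, use in place */
    for (long k = 0; k < n; k++)       /* gather every inc-th element      */
        buffer[k] = v[k * inc];
    return buffer;
}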
#include "sgemv_n_microk_nehalem-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_n_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) #include "sgemv_n_microk_haswell-4.c" +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#include "sgemv_n_microk_haswell-4.c" +#include "sgemv_n_microk_skylakex-8.c" #endif #if defined(STEAMROLLER) || defined(EXCAVATOR) @@ -112,6 +115,8 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT #endif +#ifndef HAVE_SGEMV_N_SKYLAKE_KERNEL + #ifndef HAVE_KERNEL_4x2 static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); @@ -167,6 +172,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT } +#endif #endif #ifndef HAVE_KERNEL_4x1 @@ -291,6 +297,38 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + if ( m < 1 || n < 1) return(0); + + #ifdef HAVE_SGEMV_N_SKYLAKE_KERNEL + if (m <= 16384 && n <= 48 && !(n == 4)) + { + FLOAT * xbuffer_align = x; + FLOAT * ybuffer_align = y; + + if (inc_x != 1) { + xbuffer_align = buffer; + for(BLASLONG i=0; i= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_SGEMV_N_SKYLAKE_KERNEL 1 +#include "common.h" +#include +static int sgemv_kernel_n_128(BLASLONG m, BLASLONG n, float alpha, float *a, BLASLONG lda, float *x, float *y) +{ + __m512 matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7; + __m512 xArray_0; + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); + BLASLONG tag_m_128x = m & (~127); + BLASLONG tag_m_64x = m & (~63); + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_16x = m & (~15); + + for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + accum512_4 = _mm512_setzero_ps(); + accum512_5 = _mm512_setzero_ps(); + accum512_6 = _mm512_setzero_ps(); + accum512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]); + matrixArray_2 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 32]); + matrixArray_3 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 48]); + matrixArray_4 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 64]); + matrixArray_5 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 80]); + matrixArray_6 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 96]); + matrixArray_7 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 112]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1); + accum512_2 = _mm512_fmadd_ps(matrixArray_2, xArray_0, accum512_2); + accum512_3 = _mm512_fmadd_ps(matrixArray_3, xArray_0, accum512_3); + accum512_4 = _mm512_fmadd_ps(matrixArray_4, xArray_0, accum512_4); + accum512_5 = _mm512_fmadd_ps(matrixArray_5, xArray_0, accum512_5); + accum512_6 = _mm512_fmadd_ps(matrixArray_6, xArray_0, accum512_6); + 
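sgemv_kernel_n_128 (begun above) walks the rows of y in blocks of 128, then 64, 32, 16, and a masked tail; for each block, every iteration of the n loop adds one column of A times the broadcast x element into the zmm accumulators before alpha is applied at the store. A scalar sketch of the computation it performs (sgemv_n_scalar is a hypothetical helper; column-major A with leading dimension lda):

static void sgemv_n_scalar(long m, long n, float alpha, const float *a, long lda,
                           const float *x, float *y)
{
    for (long i = 0; i < m; i++) {
        float acc = 0.0f;
        for (long j = 0; j < n; j++)
            acc += a[j * lda + i] * x[j];    /* row i across the n columns */
        y[i] += alpha * acc;                 /* y = alpha*A*x + y          */
    }
}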
accum512_7 = _mm512_fmadd_ps(matrixArray_7, xArray_0, accum512_7); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(accum512_2, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(accum512_3, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + _mm512_storeu_ps(&y[idx_m + 64], _mm512_fmadd_ps(accum512_4, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 64]))); + _mm512_storeu_ps(&y[idx_m + 80], _mm512_fmadd_ps(accum512_5, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 80]))); + _mm512_storeu_ps(&y[idx_m + 96], _mm512_fmadd_ps(accum512_6, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 96]))); + _mm512_storeu_ps(&y[idx_m + 112], _mm512_fmadd_ps(accum512_7, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 112]))); + } + if (tag_m_128x != m) { + for (BLASLONG idx_m = tag_m_128x; idx_m < tag_m_64x; idx_m+=64) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]); + matrixArray_2 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 32]); + matrixArray_3 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 48]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1); + accum512_2 = _mm512_fmadd_ps(matrixArray_2, xArray_0, accum512_2); + accum512_3 = _mm512_fmadd_ps(matrixArray_3, xArray_0, accum512_3); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(accum512_2, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(accum512_3, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + } + + if(tag_m_64x != m) { + for (BLASLONG idx_m = tag_m_64x; idx_m < tag_m_32x; idx_m+=32) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + } + + if(tag_m_32x != m) { + + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + } + + if (tag_m_16x != m) { + accum512_0 = 
_mm512_setzero_ps(); + + unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + + for(BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + matrixArray_0 = _mm512_maskz_loadu_ps(tail_mask, &a[idx_n * lda + tag_m_16x]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + } + + _mm512_mask_storeu_ps(&y[tag_m_16x], tail_mask, _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_maskz_loadu_ps(tail_mask, &y[tag_m_16x]))); + + } + } + } + } + return 0; +} + +static int sgemv_kernel_n_64(BLASLONG m, BLASLONG n, float alpha, float *a, BLASLONG lda, float *x, float *y) +{ + __m256 ma0, ma1, ma2, ma3, ma4, ma5, ma6, ma7; + __m256 as0, as1, as2, as3, as4, as5, as6, as7; + __m256 alphav = _mm256_set1_ps(alpha); + __m256 xv; + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + __mmask8 one_mask = 0xff; + + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + as0 = _mm256_setzero_ps(); + as1 = _mm256_setzero_ps(); + as2 = _mm256_setzero_ps(); + as3 = _mm256_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma0 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +0]); + ma1 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +8]); + ma2 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +16]); + ma3 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +24]); + + as0 = _mm256_maskz_fmadd_ps(one_mask, ma0, xv, as0); + as1 = _mm256_maskz_fmadd_ps(one_mask, ma1, xv, as1); + as2 = _mm256_maskz_fmadd_ps(one_mask, ma2, xv, as2); + as3 = _mm256_maskz_fmadd_ps(one_mask, ma3, xv, as3); + } + _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as0, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m]))); + _mm256_mask_storeu_ps(&y[idx_m + 8], one_mask, _mm256_maskz_fmadd_ps(one_mask, as1, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 8]))); + _mm256_mask_storeu_ps(&y[idx_m + 16], one_mask, _mm256_maskz_fmadd_ps(one_mask, as2, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 16]))); + _mm256_mask_storeu_ps(&y[idx_m + 24], one_mask, _mm256_maskz_fmadd_ps(one_mask, as3, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 24]))); + + } + + if (tag_m_32x != m ) { + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { + as4 = _mm256_setzero_ps(); + as5 = _mm256_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma4 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +0]); + ma5 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +8]); + + as4 = _mm256_maskz_fmadd_ps(one_mask, ma4, xv, as4); + as5 = _mm256_maskz_fmadd_ps(one_mask, ma5, xv, as5); + } + _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as4, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m]))); + _mm256_mask_storeu_ps(&y[idx_m + 8], one_mask, _mm256_maskz_fmadd_ps(one_mask, as5, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 8]))); + } + + if (tag_m_16x != m ) { + for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + as6 = _mm256_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma6 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m]); + as6 = _mm256_maskz_fmadd_ps(one_mask, ma6, xv, as6); + } + _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as6, alphav, _mm256_maskz_loadu_ps(one_mask, 
&y[idx_m]))); + } + + if (tag_m_8x != m) { + as7 = _mm256_setzero_ps(); + + unsigned char tail_mask_uint = (((unsigned char)0xff) >> (8-(m&7))); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_uint); + + for(BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma7 = _mm256_maskz_loadu_ps(tail_mask, &a[idx_n * lda + tag_m_8x]); + + as7 = _mm256_maskz_fmadd_ps(tail_mask, ma7, xv, as7); + } + + _mm256_mask_storeu_ps(&y[tag_m_8x], tail_mask, _mm256_maskz_fmadd_ps(tail_mask, as7, alphav, _mm256_maskz_loadu_ps(tail_mask, &y[tag_m_8x]))); + + } + } + } + + return 0; +} + + +#endif \ No newline at end of file diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index fe886f57f..0be2c7e97 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -34,8 +34,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) #include "sgemv_t_microk_haswell-4.c" +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#include "sgemv_t_microk_haswell-4.c" +#include "sgemv_t_microk_skylakex.c" #endif #if defined(STEAMROLLER) || defined(EXCAVATOR) @@ -305,6 +308,37 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( m < 1 ) return(0); if ( n < 1 ) return(0); + #ifdef HAVE_SGEMV_T_SKYLAKE_KERNEL + if (lda == m && n <= 16384 && m <= 8) + { + FLOAT * xbuffer_align = x; + FLOAT * ybuffer_align = y; + + if (inc_x != 1) { + xbuffer_align = buffer; + for(BLASLONG i=0; i= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_SGEMV_T_SKYLAKE_KERNEL 1 +#include "common.h" +#include +#include "sgemv_t_microk_skylakex_template.c" + +//sgemv_t: +// ----- m ----- +// |<----------- +// |<----------- +// n +// |<----------- +// |<----------- + +static int sgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, float *a, float *x, float *y) +{ + switch(m) { + case 1: sgemv_kernel_t_1(n, alpha, a, x, y); break; + case 2: sgemv_kernel_t_2(n, alpha, a, x, y); break; + case 3: sgemv_kernel_t_3(n, alpha, a, x, y); break; + case 4: sgemv_kernel_t_4(n, alpha, a, x, y); break; + case 5: sgemv_kernel_t_5(n, alpha, a, x, y); break; + case 6: sgemv_kernel_t_6(n, alpha, a, x, y); break; + case 7: sgemv_kernel_t_7(n, alpha, a, x, y); break; + case 8: sgemv_kernel_t_8(n, alpha, a, x, y); break; + default: break; + } + return 0; +} + +#endif diff --git a/kernel/x86_64/sgemv_t_microk_skylakex_template.c b/kernel/x86_64/sgemv_t_microk_skylakex_template.c new file mode 100644 index 000000000..7f2144353 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_skylakex_template.c @@ -0,0 +1,1121 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#include +#include "common.h" + +//Here the m means n in sgemv_t: +// ----- n ----- +// | +// | +// m +// | +// | +static int sgemv_kernel_t_1(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + //printf("enter into t_1 kernel\n"); + //printf("m = %ld\n", m); + __m512 matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + float alphaX = alpha * (*x); + __m512 ALPHAXVECTOR = _mm512_set1_ps(alphaX); + + BLASLONG tag_m_128x = m & (~127); + BLASLONG tag_m_64x = m & (~63); + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_16x = m & (~15); + + for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) { + matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_m + 16]); + matrixArray_2 = _mm512_loadu_ps(&a[idx_m + 32]); + matrixArray_3 = _mm512_loadu_ps(&a[idx_m + 48]); + matrixArray_4 = _mm512_loadu_ps(&a[idx_m + 64]); + matrixArray_5 = _mm512_loadu_ps(&a[idx_m + 80]); + matrixArray_6 = _mm512_loadu_ps(&a[idx_m + 96]); + matrixArray_7 = _mm512_loadu_ps(&a[idx_m + 112]); + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(matrixArray_1, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(matrixArray_2, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(matrixArray_3, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + _mm512_storeu_ps(&y[idx_m + 64], _mm512_fmadd_ps(matrixArray_4, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 64]))); + _mm512_storeu_ps(&y[idx_m + 80], _mm512_fmadd_ps(matrixArray_5, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 80]))); + _mm512_storeu_ps(&y[idx_m + 96], _mm512_fmadd_ps(matrixArray_6, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 96]))); + _mm512_storeu_ps(&y[idx_m + 112], _mm512_fmadd_ps(matrixArray_7, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 112]))); + + } + + if (tag_m_128x != m) { + for (BLASLONG idx_m = tag_m_128x; idx_m < tag_m_64x; idx_m+=64) { + matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_m + 16]); + matrixArray_2 = _mm512_loadu_ps(&a[idx_m + 32]); + matrixArray_3 = _mm512_loadu_ps(&a[idx_m + 48]); + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(matrixArray_1, ALPHAXVECTOR, 
_mm512_loadu_ps(&y[idx_m + 16]))); + _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(matrixArray_2, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(matrixArray_3, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + + } + + if (tag_m_64x != m) { + for (BLASLONG idx_m = tag_m_64x; idx_m < tag_m_32x; idx_m+=32) { + matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_m + 16]); + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(matrixArray_1, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + + } + + if (tag_m_32x != m) { + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { + matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + } + + if (tag_m_16x != m) { + unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray_0 = _mm512_maskz_loadu_ps(tail_mask, &a[tag_m_16x]); + + _mm512_mask_storeu_ps(&y[tag_m_16x], tail_mask, _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_maskz_loadu_ps(tail_mask, &y[tag_m_16x]))); + + } + + + } + } + } + + return 0; +} + +static int sgemv_kernel_t_2(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + __m512 m0, m1, m2, m3, col0_1, col0_2, col1_1, col1_2, x1Array, x2Array; + float x1a = x[0] * alpha; + float x2a = x[1] * alpha; + x1Array = _mm512_set1_ps(x1a); + x2Array = _mm512_set1_ps(x2a); + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i idx_base_0 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_1); + + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + m0 = _mm512_loadu_ps(&a[idx_m*2]); + m1 = _mm512_loadu_ps(&a[idx_m*2 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*2 + 32]); + m3 = _mm512_loadu_ps(&a[idx_m*2 + 48]); + col0_1 = _mm512_permutex2var_ps(m0, idx_base_0, m1); + col0_2 = _mm512_permutex2var_ps(m0, idx_base_1, m1); + col1_1 = _mm512_permutex2var_ps(m2, idx_base_0, m3); + col1_2 = _mm512_permutex2var_ps(m2, idx_base_1, m3); + + _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col0_2, _mm512_mul_ps(col0_1, x1Array)), _mm512_loadu_ps(&y[idx_m]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col1_2, _mm512_mul_ps(col1_1, x1Array)), _mm512_loadu_ps(&y[idx_m + 16]))); + } + if (tag_m_32x != m) { + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m*2]); + m1 = _mm512_loadu_ps(&a[idx_m*2 + 16]); + col1_1 = _mm512_permutex2var_ps(m0, idx_base_0, m1); + col1_2 = _mm512_permutex2var_ps(m0, idx_base_1, m1); + _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col1_2, _mm512_mul_ps(col1_1, x1Array)), _mm512_loadu_ps(&y[idx_m]))); + } + if (tag_m_16x != m) { + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); + unsigned char load_mask_value = (((unsigned char)0xff) >> 6); + __mmask8 load_mask = *((__mmask8*) &load_mask_value); + x1Array = _mm512_broadcast_f32x2(_mm_maskz_loadu_ps(load_mask, x)); + for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + m0 = _mm512_loadu_ps(&a[idx_m*2]); + m1 = _mm512_mul_ps(_mm512_mul_ps(m0, x1Array), 
ALPHAVECTOR); + m2 = _mm512_permutexvar_ps(_mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), m1); + __m256 ret = _mm256_add_ps(_mm512_extractf32x8_ps(m2, 1), _mm512_extractf32x8_ps(m2, 0)); + _mm256_storeu_ps(&y[idx_m], _mm256_add_ps(ret, _mm256_loadu_ps(&y[idx_m]))); + + } + + if (tag_m_8x != m) { + unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-(((m-tag_m_8x)*2)&15))); + __mmask16 a_mask = *((__mmask16*) &tail_mask_value); + unsigned char y_mask_value = (((unsigned char)0xff) >> (8-(m-tag_m_8x))); + __mmask8 y_mask = *((__mmask8*) &y_mask_value); + + m0 = _mm512_maskz_loadu_ps(a_mask, &a[tag_m_8x*2]); + m1 = _mm512_mul_ps(_mm512_mul_ps(m0, x1Array), ALPHAVECTOR); + m2 = _mm512_permutexvar_ps(_mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), m1); + __m256 ret = _mm256_add_ps(_mm512_extractf32x8_ps(m2, 1), _mm512_extractf32x8_ps(m2, 0)); + _mm256_mask_storeu_ps(&y[tag_m_8x], y_mask, _mm256_add_ps(ret, _mm256_maskz_loadu_ps(y_mask, &y[tag_m_8x]))); + } + } + } + return 0; +} + +static int sgemv_kernel_t_3(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + __m512 m0, m1, m2, c1, c2, c3, tmp, x1Array, x2Array, x3Array; + float x1a = x[0] * alpha; + float x2a = x[1] * alpha; + float x3a = x[2] * alpha; + x1Array = _mm512_set1_ps(x1a); + x2Array = _mm512_set1_ps(x2a); + x3Array = _mm512_set1_ps(x3a); + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i M512_EPI32_s1 = _mm512_set1_epi32(-1); + __m512i idx_c1_1 = _mm512_set_epi32(0, 0, 0, 0, 0, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i idx_c2_1 = _mm512_add_epi32(idx_c1_1, M512_EPI32_1); + __m512i idx_c3_1 = _mm512_add_epi32(idx_c2_1, M512_EPI32_1); + + __m512i idx_c3_2 = _mm512_set_epi32(31, 28, 25, 22, 19, 16, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __m512i idx_c2_2 = _mm512_add_epi32(idx_c3_2, M512_EPI32_s1); + __m512i idx_c1_2 = _mm512_add_epi32(idx_c2_2, M512_EPI32_s1); + + __mmask16 step_1 = 0x07ff; + __mmask16 step_2 = 0xf800; + __mmask16 c31 = 0x03ff; + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m*3]); + m1 = _mm512_loadu_ps(&a[idx_m*3 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*3 + 32]); + + tmp = _mm512_mask_permutex2var_ps(m0, step_1, idx_c1_1, m1); + c1 = _mm512_mask_permutex2var_ps(tmp, step_2, idx_c1_2, m2); + tmp = _mm512_mask_permutex2var_ps(m0, step_1, idx_c2_1, m1); + c2 = _mm512_mask_permutex2var_ps(tmp, step_2, idx_c2_2, m2); + tmp = _mm512_mask_permutex2var_ps(m0, c31, idx_c3_1, m1); + c3 = _mm512_permutex2var_ps(tmp, idx_c3_2, m2); + + tmp = _mm512_fmadd_ps(x2Array, c2, _mm512_mul_ps(c1, x1Array)); + _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x3Array, c3, tmp), _mm512_loadu_ps(&y[idx_m]))); + } + + if(tag_m_16x != m) { + __mmask8 a_mask = 0xff; + __m256i M256_EPI32_1 = _mm256_maskz_set1_epi32(a_mask, 1); + __m256i M256_EPI32_s1 = _mm256_maskz_set1_epi32(a_mask, -1); + __m256i idx_c1_1 = _mm256_set_epi32(0, 0, 15, 12, 9, 6, 3, 0); + __m256i idx_c2_1 = _mm256_add_epi32(idx_c1_1, M256_EPI32_1); + __m256i idx_c3_1 = _mm256_add_epi32(idx_c2_1, M256_EPI32_1); + + __m256i idx_c3_2 = _mm256_set_epi32(15, 12, 9, 0, 0, 0, 0, 0); + __m256i idx_c2_2 = _mm256_add_epi32(idx_c3_2, M256_EPI32_s1); + __m256i idx_c1_2 = _mm256_add_epi32(idx_c2_2, M256_EPI32_s1); + + __mmask8 step_1 = 0x1f; + __mmask8 step_2 = 0xe0; + __mmask8 c12 = 0xc0; + + __m256 m256_0, m256_1, m256_2, tmp256, 
c256_1, c256_2, c256_3, x256_1, x256_2, x256_3; + x256_1 = _mm256_set1_ps(x1a); + x256_2 = _mm256_set1_ps(x2a); + x256_3 = _mm256_set1_ps(x3a); + + for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + m256_0 = _mm256_loadu_ps(&a[idx_m*3]); + m256_1 = _mm256_loadu_ps(&a[idx_m*3 + 8]); + m256_2 = _mm256_loadu_ps(&a[idx_m*3 + 16]); + + tmp256 = _mm256_permutex2var_ps(m256_0, idx_c1_1, m256_1); + c256_1 = _mm256_mask_permutex2var_ps(tmp256, c12, idx_c1_2, m256_2); + tmp256 = _mm256_mask_permutex2var_ps(m256_0, step_1, idx_c2_1, m256_1); + c256_2 = _mm256_mask_permutex2var_ps(tmp256, step_2, idx_c2_2, m256_2); + tmp256 = _mm256_mask_permutex2var_ps(m256_0, step_1, idx_c3_1, m256_1); + c256_3 = _mm256_mask_permutex2var_ps(tmp256, step_2, idx_c3_2, m256_2); + + tmp256 = _mm256_fmadd_ps(x256_2, c256_2, _mm256_mul_ps(c256_1, x256_1)); + _mm256_storeu_ps(&y[idx_m], _mm256_maskz_add_ps(a_mask, _mm256_fmadd_ps(x256_3, c256_3, tmp256), _mm256_loadu_ps(&y[idx_m]))); + } + + if(tag_m_8x != m){ + for (BLASLONG idx_m = tag_m_8x; idx_m < tag_m_4x; idx_m+=4){ + m0 = _mm512_maskz_loadu_ps(0x0fff, &a[tag_m_8x*3]); + m256_0 = _mm512_extractf32x8_ps(m0, 0); + m256_1 = _mm512_extractf32x8_ps(m0, 1); + __m256i idx1 = _mm256_set_epi32(10, 7, 4, 1, 9, 6, 3, 0); + __m256i M256_EPI32_2 = _mm256_maskz_set1_epi32(0x0f, 2); + __m256i idx2 = _mm256_add_epi32(idx1, M256_EPI32_2); + + c256_1 = _mm256_mask_permutex2var_ps(m256_0, 0xff, idx1, m256_1); + c256_2 = _mm256_mask_permutex2var_ps(m256_0, 0x0f, idx2, m256_1); + + __m128 c128_1 = _mm256_extractf32x4_ps(c256_1, 0); + __m128 c128_2 = _mm256_extractf32x4_ps(c256_1, 1); + __m128 c128_3 = _mm256_extractf32x4_ps(c256_2, 0); + + __m128 x128_1 = _mm_set1_ps(x1a); + __m128 x128_2 = _mm_set1_ps(x2a); + __m128 x128_3 = _mm_set1_ps(x3a); + + __m128 tmp128 = _mm_maskz_fmadd_ps(0x0f, c128_1, x128_1, _mm_maskz_mul_ps(0x0f, c128_2, x128_2)); + _mm_mask_storeu_ps(&y[idx_m], 0x0f, _mm_maskz_add_ps(0x0f, _mm_maskz_fmadd_ps(0x0f, c128_3, x128_3, tmp128), _mm_maskz_loadu_ps(0x0f, &y[idx_m]))); + } + + if(tag_m_4x != m) { + for (BLASLONG idx_m = tag_m_4x; idx_m < tag_m_2x; idx_m+=2) { + m256_0 = _mm256_maskz_loadu_ps(0x3f, &a[idx_m*3]); + __m128 a128_1 = _mm256_extractf32x4_ps(m256_0, 0); + __m128 a128_2 = _mm256_extractf32x4_ps(m256_0, 1); + __m128 x128 = _mm_maskz_loadu_ps(0x07, x); + + __m128i idx128_1= _mm_set_epi32(0, 2, 1, 0); + __m128i M128_EPI32_3 = _mm_maskz_set1_epi32(0x07, 3); + __m128i idx128_2 = _mm_add_epi32(idx128_1, M128_EPI32_3); + + __m128 c128_1 = _mm_maskz_permutex2var_ps(0x07, a128_1, idx128_1, a128_2); + __m128 c128_2 = _mm_maskz_permutex2var_ps(0x07, a128_1, idx128_2, a128_2); + + __m128 tmp128 = _mm_hadd_ps(_mm_maskz_mul_ps(0x07, c128_1, x128), _mm_maskz_mul_ps(0x07, c128_2, x128)); + float ret[4]; + _mm_mask_storeu_ps(ret, 0x0f, tmp128); + y[idx_m] += alpha *(ret[0] + ret[1]); + y[idx_m+1] += alpha * (ret[2] + ret[3]); + } + + if(tag_m_2x != m) { + y[tag_m_2x] += alpha*(a[tag_m_2x*3]*x[0] + a[tag_m_2x*3+1]*x[1] + a[tag_m_2x*3+2]*x[2]); + } + } + } + } + + return 0; +} + +static int sgemv_kernel_t_4(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + __m512 m0, m1; + __m256 m256_0, m256_1, c256_1, c256_2; + __m128 c1, c2, c3, c4, ret; + __m128 xarray = _mm_maskz_loadu_ps(0x0f, x); + __m512 x512 = _mm512_broadcast_f32x4(xarray); + __m512 alphavector = _mm512_set1_ps(alpha); + __m512 xa512 = _mm512_mul_ps(x512, alphavector); + __m256i idx1 = _mm256_set_epi32(13, 9, 5, 1, 12, 8, 4, 0); 
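For reference, sgemv_kernel_t_4 computes y[i] += alpha * (a[i*4+0]*x[0] + ... + a[i*4+3]*x[3]) for each packed row of the m-by-4 block; the permutex2var/extract shuffles only regroup the products so four rows are reduced at once. A plain-C sketch of the same arithmetic, useful for cross-checking the intrinsics path (illustrative only, names not taken from the patch):

#include <stddef.h>

/* Scalar reference for the packed m x 4 transposed case:
 * y[i] += alpha * dot(a[i*4 .. i*4+3], x)                      */
static void sgemv_t4_reference(size_t m, float alpha,
                               const float *a,  /* m rows of 4 floats */
                               const float *x,  /* 4 floats           */
                               float *y)        /* m floats           */
{
    for (size_t i = 0; i < m; i++) {
        float dot = 0.0f;
        for (size_t j = 0; j < 4; j++)
            dot += a[i * 4 + j] * x[j];
        y[i] += alpha * dot;   /* same update the AVX-512 path performs */
    }
}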
+ __m256i idx2 = _mm256_set_epi32(15, 11, 7, 3, 14, 10, 6, 2); + + + for (BLASLONG idx_m = 0; idx_m < tag_m_4x; idx_m+=4) { + m0 = _mm512_loadu_ps(&a[idx_m*4]); + m1 = _mm512_mul_ps(m0, xa512); + m256_0 = _mm512_extractf32x8_ps(m1, 0); + m256_1 = _mm512_extractf32x8_ps(m1, 1); + c256_1 = _mm256_mask_permutex2var_ps(m256_0, 0xff, idx1, m256_1); + c256_2 = _mm256_mask_permutex2var_ps(m256_0, 0xff, idx2, m256_1); + + c1 = _mm256_extractf32x4_ps(c256_1, 0); + c2 = _mm256_extractf32x4_ps(c256_1, 1); + c3 = _mm256_extractf32x4_ps(c256_2, 0); + c4 = _mm256_extractf32x4_ps(c256_2, 1); + + ret = _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, c1, c2), _mm_maskz_add_ps(0xff, c3, c4)), _mm_maskz_loadu_ps(0xff, &y[idx_m])); + _mm_mask_storeu_ps(&y[idx_m], 0xff, ret); + } + + if(tag_m_4x != m) { + float result[4]; + for(BLASLONG idx_m=tag_m_4x; idx_m < tag_m_2x; idx_m+=2) { + m256_0 = _mm256_maskz_loadu_ps(0xff, &a[idx_m*4]); + c1 = _mm256_maskz_extractf32x4_ps(0xff, m256_0, 0); + c2 = _mm256_maskz_extractf32x4_ps(0xff, m256_0, 1); + + c3 = _mm_maskz_mul_ps(0x0f, c1, xarray); + c4 = _mm_maskz_mul_ps(0x0f, c2, xarray); + + ret = _mm_hadd_ps(c3, c4); + _mm_mask_storeu_ps(result, 0x0f, ret); + y[idx_m] += alpha *(result[0] + result[1]); + y[idx_m+1] += alpha * (result[2] + result[3]); + } + + if(tag_m_2x != m ) { + c1 = _mm_maskz_loadu_ps(0x0f, &a[tag_m_2x * 4]); + c2 = _mm_maskz_mul_ps(0x0f, c1, xarray); + _mm_mask_storeu_ps(result, 0x0f, c2); + y[tag_m_2x] += alpha *(result[0] + result[1] + result[2] + result[3]); + } + } + + return 0; +} + +static int sgemv_kernel_t_5(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + __m512 m0, m1, m2, m3, m4, tmp0, tmp1, tmp2, accum, c0, c1, c2, c3, c4; + __m512 x0_512 = _mm512_set1_ps(x[0]); + __m512 x1_512 = _mm512_set1_ps(x[1]); + __m512 x2_512 = _mm512_set1_ps(x[2]); + __m512 x3_512 = _mm512_set1_ps(x[3]); + __m512 x4_512 = _mm512_set1_ps(x[4]); + __m512 alpha_512 = _mm512_set1_ps(alpha); + + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i M512_EPI32_16 = _mm512_set1_epi32(16); + __m512i M512_EPI32_0 = _mm512_setzero_epi32(); + + __m512i idx_c0 = _mm512_set_epi32(27, 22, 17, 28, 23, 18, 13, 8, 3, 30, 25, 20, 15, 10, 5, 0); + __m512i idx_c1 = _mm512_add_epi32(idx_c0, M512_EPI32_1); + __m512i idx_c2 = _mm512_add_epi32(idx_c1, M512_EPI32_1); + idx_c2 = _mm512_mask_blend_epi32(0x0040, idx_c2, M512_EPI32_0); + __m512i idx_c3 = _mm512_add_epi32(idx_c2, M512_EPI32_1); + __m512i idx_c4 = _mm512_add_epi32(idx_c3, M512_EPI32_1); + idx_c4 = _mm512_mask_blend_epi32(0x1000, idx_c4, M512_EPI32_16); + + for (BLASLONG idx_m=0; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m*5]); + m1 = _mm512_loadu_ps(&a[idx_m*5 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*5 + 32]); + m3 = _mm512_loadu_ps(&a[idx_m*5 + 48]); + m4 = _mm512_loadu_ps(&a[idx_m*5 + 64]); + + tmp0 = _mm512_maskz_permutex2var_ps(0x007f, m0, idx_c0, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1f80, m2, idx_c0, m3); + c0 = _mm512_mask_blend_ps(0x1f80, tmp0, tmp1); + c0 = _mm512_mask_permutex2var_ps(c0, 0xe000, idx_c0, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x007f, m0, idx_c1, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1f80, m2, idx_c1, m3); + c1 = _mm512_mask_blend_ps(0x1f80, tmp0, tmp1); + c1 = _mm512_mask_permutex2var_ps(c1, 0xe000, idx_c1, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c2, m1); + tmp1 = 
_mm512_maskz_permutex2var_ps(0x1fc0, m2, idx_c2, m3); + c2 = _mm512_mask_blend_ps(0x1fc0, tmp0, tmp1); + c2 = _mm512_mask_permutex2var_ps(c2, 0xe000, idx_c2, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c3, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1fc0, m2, idx_c3, m3); + c3 = _mm512_mask_blend_ps(0x1fc0, tmp0, tmp1); + c3 = _mm512_mask_permutex2var_ps(c3, 0xe000, idx_c3, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c4, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x0fc0, m2, idx_c4, m3); + c4 = _mm512_mask_blend_ps(0x0fc0, tmp0, tmp1); + c4 = _mm512_mask_permutex2var_ps(c4, 0xf000, idx_c4, m4); + + accum = _mm512_fmadd_ps(c1, x1_512, _mm512_mul_ps(c0, x0_512)); + accum = _mm512_fmadd_ps(c2, x2_512, accum); + accum = _mm512_fmadd_ps(c3, x3_512, accum); + accum = _mm512_fmadd_ps(c4, x4_512, accum); + accum = _mm512_fmadd_ps(accum, alpha_512, _mm512_loadu_ps(&y[idx_m])); + _mm512_storeu_ps(&y[idx_m], accum); + + } + if(tag_m_16x !=m) { + __m512i idx_c0c2 = _mm512_set_epi32(0, 0, 27, 22, 17, 12, 7, 2 , 0, 30, 25, 20, 15, 10, 5, 0); + __m512i idx_c1c3 = _mm512_add_epi32(idx_c0c2, M512_EPI32_1); + idx_c4 = _mm512_add_epi32(idx_c1c3, M512_EPI32_1); + __m256i idx_c0m4 = _mm256_set_epi32(11, 6, 0, 0, 0, 0, 0, 0); + __m256i M256_EPI32_1 = _mm256_set1_epi32(1); + __m256i idx_c1m4 = _mm256_add_epi32(idx_c0m4, M256_EPI32_1); + __m256i idx_c2m4 = _mm256_add_epi32(idx_c1m4, M256_EPI32_1); + __m256i idx_c3m4 = _mm256_add_epi32(idx_c2m4, M256_EPI32_1); + __m256i idx_c4m4 = _mm256_add_epi32(idx_c3m4, M256_EPI32_1); + //TODO: below can change to use extract to decrease the latency + __m256 x0_256 = _mm256_set1_ps(x[0]); + __m256 x1_256 = _mm256_set1_ps(x[1]); + __m256 x2_256 = _mm256_set1_ps(x[2]); + __m256 x3_256 = _mm256_set1_ps(x[3]); + __m256 x4_256 = _mm256_set1_ps(x[4]); + __m256 alpha256 = _mm256_set1_ps(alpha); + __m256 accum_256, m256_4; + + for(BLASLONG idx_m=tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + m0 = _mm512_loadu_ps(&a[idx_m*5]); + m1 = _mm512_loadu_ps(&a[idx_m*5 + 16]); + m256_4 = _mm256_loadu_ps(&a[idx_m*5 + 32]); + tmp0 = _mm512_permutex2var_ps(m0, idx_c0c2, m1); + tmp1 = _mm512_permutex2var_ps(m0, idx_c1c3, m1); + tmp2 = _mm512_permutex2var_ps(m0, idx_c4, m1); + + __m256 c256_0 = _mm512_extractf32x8_ps(tmp0, 0); + __m256 c256_2 = _mm512_extractf32x8_ps(tmp0, 1); + __m256 c256_1 = _mm512_extractf32x8_ps(tmp1, 0); + __m256 c256_3 = _mm512_extractf32x8_ps(tmp1, 1); + __m256 c256_4 = _mm512_extractf32x8_ps(tmp2, 1); + + c256_0 = _mm256_mask_permutex2var_ps(c256_0, 0x80, idx_c0m4, m256_4); + c256_1 = _mm256_mask_permutex2var_ps(c256_1, 0x80, idx_c1m4, m256_4); + c256_2 = _mm256_mask_permutex2var_ps(c256_2, 0xc0, idx_c2m4, m256_4); + c256_3 = _mm256_mask_permutex2var_ps(c256_3, 0xc0, idx_c3m4, m256_4); + c256_4 = _mm256_mask_permutex2var_ps(c256_4, 0xc0, idx_c4m4, m256_4); + + accum_256 = _mm256_fmadd_ps(c256_1, x1_256, _mm256_mul_ps(c256_0, x0_256)); + accum_256 = _mm256_fmadd_ps(c256_2, x2_256, accum_256); + accum_256 = _mm256_fmadd_ps(c256_3, x3_256, accum_256); + accum_256 = _mm256_fmadd_ps(c256_4, x4_256, accum_256); + accum_256 = _mm256_fmadd_ps(accum_256, alpha256, _mm256_loadu_ps(&y[idx_m])); + _mm256_storeu_ps(&y[idx_m], accum_256); + } + if(tag_m_8x != m) { + __m256i idx_c02 = _mm256_set_epi32(17, 12, 7, 2, 15, 10, 5, 0); + __m256i idx_c13 = _mm256_add_epi32(idx_c02, M256_EPI32_1); + __m256i idx_4 = _mm256_add_epi32(idx_c13, M256_EPI32_1); + __m128 accum_128; + __m256 m256_0, m256_1, tmp256_0, tmp256_1; + for (BLASLONG idx_m = tag_m_8x; idx_m < 
tag_m_4x; idx_m+=4){ + m256_0 = _mm256_loadu_ps(&a[idx_m*5]); + m256_1 = _mm256_loadu_ps(&a[idx_m*5 + 8]); + __m128 m128_4 = _mm_maskz_loadu_ps(0x0f, &a[idx_m*5 + 16]); + + tmp256_0 = _mm256_permutex2var_ps(m256_0, idx_c02, m256_1); + tmp256_1 = _mm256_permutex2var_ps(m256_0, idx_c13, m256_1); + __m256 tmp256_2 = _mm256_maskz_permutex2var_ps(0xf0, m256_0, idx_4, m256_1); + + __m128 c128_0 = _mm256_extractf32x4_ps(tmp256_0, 0); + __m128 c128_1 = _mm256_extractf32x4_ps(tmp256_1, 0); + __m128 c128_2 = _mm256_extractf32x4_ps(tmp256_0, 1); + __m128 c128_3 = _mm256_extractf32x4_ps(tmp256_1, 1); + __m128 c128_4 = _mm256_extractf32x4_ps(tmp256_2, 1); + + __m128i idx_c14 = _mm_set_epi32(4, 0, 0, 0); + __m128i M128_EPI32_1 = _mm_set1_epi32(1); + __m128i idx_c24 = _mm_add_epi32(idx_c14, M128_EPI32_1); + __m128i idx_c34 = _mm_add_epi32(idx_c24, M128_EPI32_1); + __m128i idx_c44 = _mm_add_epi32(idx_c34, M128_EPI32_1); + + c128_1 = _mm_mask_permutex2var_ps(c128_1, 0x08, idx_c14, m128_4); + c128_2 = _mm_mask_permutex2var_ps(c128_2, 0x08, idx_c24, m128_4); + c128_3 = _mm_mask_permutex2var_ps(c128_3, 0x08, idx_c34, m128_4); + c128_4 = _mm_mask_permutex2var_ps(c128_4, 0x08, idx_c44, m128_4); + + __m128 x128_0 = _mm256_extractf32x4_ps(x0_256, 0); + __m128 x128_1 = _mm256_extractf32x4_ps(x1_256, 0); + __m128 x128_2 = _mm256_extractf32x4_ps(x2_256, 0); + __m128 x128_3 = _mm256_extractf32x4_ps(x3_256, 0); + __m128 x128_4 = _mm256_extractf32x4_ps(x4_256, 0); + + __m128 alpha_128 = _mm256_extractf32x4_ps(alpha256, 0); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_1, x128_1, _mm_maskz_mul_ps(0x0f, c128_0, x128_0)); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_2, x128_2, accum_128); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_3, x128_3, accum_128); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_4, x128_4, accum_128); + accum_128 = _mm_maskz_fmadd_ps(0x0f, accum_128, alpha_128, _mm_maskz_loadu_ps(0x0f, &y[idx_m])); + _mm_mask_storeu_ps(&y[idx_m], 0x0f, accum_128); + + } + + if(tag_m_4x !=m ){ + x0_256 = _mm256_maskz_loadu_ps(0x1f, x); + x0_256 = _mm256_mul_ps(x0_256, alpha256); + float ret8[8]; + + for(BLASLONG idx_m = tag_m_4x; idx_m < tag_m_2x; idx_m+=2){ + m256_0 = _mm256_maskz_loadu_ps(0x1f, &a[idx_m*5]); + m256_1 = _mm256_maskz_loadu_ps(0x1f, &a[idx_m*5 + 5]); + + m256_0 = _mm256_mul_ps(m256_0, x0_256); + m256_1 = _mm256_mul_ps(m256_1, x0_256); + + _mm256_mask_storeu_ps(ret8, 0x1f, m256_0); + y[idx_m] += ret8[0] + ret8[1] + ret8[2] + ret8[3] + ret8[4]; + _mm256_mask_storeu_ps(ret8, 0x1f, m256_1); + y[idx_m+1] += ret8[0] + ret8[1] + ret8[2] + ret8[3] + ret8[4]; + + } + + if(tag_m_2x != m){ + m256_0 = _mm256_maskz_loadu_ps(0x1f, &a[tag_m_2x*5]); + m256_0 = _mm256_mul_ps(m256_0, x0_256); + + + _mm256_mask_storeu_ps(ret8, 0x1f, m256_0); + y[tag_m_2x] += ret8[0] + ret8[1] + ret8[2] + ret8[3] + ret8[4]; + + } + } + } + + } + return 0; +} + +static int sgemv_kernel_t_6(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + + __m512 m0, m1, m2, m3, m4, m5, c0, c1, c2, c3, c4, c5, tmp0, tmp1, tmp2, accum; + __m512i idx_c0 = _mm512_set_epi32(26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0); + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i M512_EPI32_0 = _mm512_setzero_epi32(); + __m512i M512_EPI32_16 = _mm512_set1_epi32(16); + __m512i idx_c1 = _mm512_add_epi32(idx_c0, M512_EPI32_1); + __m512i idx_c2 = _mm512_add_epi32(idx_c1, M512_EPI32_1); + idx_c2 = 
_mm512_mask_blend_epi32(0x0020, idx_c2, M512_EPI32_0); + __m512i idx_c3 = _mm512_add_epi32(idx_c2, M512_EPI32_1); + __m512i idx_c4 = _mm512_add_epi32(idx_c3, M512_EPI32_1); + idx_c4 = _mm512_mask_blend_epi32(0x0400, idx_c4, M512_EPI32_0); + __m512i idx_c5 = _mm512_add_epi32(idx_c4, M512_EPI32_1); + + __m512 x0_512 = _mm512_set1_ps(x[0]); + __m512 x1_512 = _mm512_set1_ps(x[1]); + __m512 x2_512 = _mm512_set1_ps(x[2]); + __m512 x3_512 = _mm512_set1_ps(x[3]); + __m512 x4_512 = _mm512_set1_ps(x[4]); + __m512 x5_512 = _mm512_set1_ps(x[5]); + __m512 alpha_512 = _mm512_set1_ps(alpha); + + for (BLASLONG idx_m=0; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m*6]); + m1 = _mm512_loadu_ps(&a[idx_m*6 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*6 + 32]); + m3 = _mm512_loadu_ps(&a[idx_m*6 + 48]); + m4 = _mm512_loadu_ps(&a[idx_m*6 + 64]); + m5 = _mm512_loadu_ps(&a[idx_m*6 + 80]); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c0, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07c0, m2, idx_c0, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c0, m5); + c0 = _mm512_mask_blend_ps(0x07c0, tmp0, tmp1); + c0 = _mm512_mask_blend_ps(0xf800, c0, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c1, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07c0, m2, idx_c1, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c1, m5); + c1 = _mm512_mask_blend_ps(0x07c0, tmp0, tmp1); + c1 = _mm512_mask_blend_ps(0xf800, c1, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c2, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07e0, m2, idx_c2, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c2, m5); + c2 = _mm512_mask_blend_ps(0x07e0, tmp0, tmp1); + c2 = _mm512_mask_blend_ps(0xf800, c2, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c3, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07e0, m2, idx_c3, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c3, m5); + c3 = _mm512_mask_blend_ps(0x07e0, tmp0, tmp1); + c3 = _mm512_mask_blend_ps(0xf800, c3, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c4, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x03e0, m2, idx_c4, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xfc00, m4, idx_c4, m5); + c4 = _mm512_mask_blend_ps(0x03e0, tmp0, tmp1); + c4 = _mm512_mask_blend_ps(0xfc00, c4, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c5 , m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x03e0, m2, idx_c5 , m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xfc00, m4, idx_c5 , m5); + c5 = _mm512_mask_blend_ps(0x03e0, tmp0, tmp1); + c5 = _mm512_mask_blend_ps(0xfc00, c5, tmp2); + + accum = _mm512_fmadd_ps(c1, x1_512, _mm512_mul_ps(c0, x0_512)); + accum = _mm512_fmadd_ps(c2, x2_512, accum); + accum = _mm512_fmadd_ps(c3, x3_512, accum); + accum = _mm512_fmadd_ps(c4, x4_512, accum); + accum = _mm512_fmadd_ps(c5, x5_512, accum); + accum = _mm512_fmadd_ps(accum, alpha_512, _mm512_loadu_ps(&y[idx_m])); + _mm512_storeu_ps(&y[idx_m], accum); + } + + if(tag_m_16x != m) { + __m512i idx_c0c3 = _mm512_set_epi32(29, 23, 17, 27, 21, 15, 9, 3, 26, 20, 30, 24, 18, 12, 6, 0); + __m512i idx_c1c4 = _mm512_add_epi32(idx_c0c3, M512_EPI32_1); + __m512i idx_c2c5 = _mm512_add_epi32(idx_c1c4, M512_EPI32_1); + idx_c2c5 = _mm512_mask_blend_epi32(0x0020, idx_c2c5, M512_EPI32_16); + __m256 c256_0, c256_1, c256_2, c256_3, c256_4, c256_5; + + __m256 x0_256 = _mm256_set1_ps(x[0]); + __m256 x1_256 = _mm256_set1_ps(x[1]); + __m256 x2_256 = _mm256_set1_ps(x[2]); + __m256 x3_256 = _mm256_set1_ps(x[3]); + __m256 x4_256 = 
_mm256_set1_ps(x[4]); + __m256 x5_256 = _mm256_set1_ps(x[5]); + __m256 alpha256 = _mm256_set1_ps(alpha); + __m256 accum_256; + + for(BLASLONG idx_m = tag_m_16x; idx_m 128) +#if V_SIMD && !defined(C_PGI) && (defined(HAVE_FMA3) || V_SIMD > 128) const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); const int unrollx = n & -vstep; @@ -198,7 +198,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT #else int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; #endif - blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); + blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)(void))rot_thread_function, nthreads); } #else rot_compute(n, x, inc_x, y, inc_y, c, s); diff --git a/kernel/x86_64/srot_microk_haswell-2.c b/kernel/x86_64/srot_microk_haswell-2.c index 8e245cc8f..b5545726e 100644 --- a/kernel/x86_64/srot_microk_haswell-2.c +++ b/kernel/x86_64/srot_microk_haswell-2.c @@ -1,5 +1,4 @@ -/* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#if defined(HAVE_FMA3) && defined(HAVE_AVX2) #define HAVE_SROT_KERNEL 1 diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index c9d698eb7..29d6a9958 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "ssymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 4d8aac1ab..02bbc1c64 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
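The srot and ssymv hunks here follow the usual OpenBLAS pattern: each wrapper picks its microkernel at compile time from the TARGET macro, and a target without a dedicated microkernel (SAPPHIRERAPIDS) is appended to an existing #elif branch so it reuses the Haswell code, exactly as SKYLAKEX and COOPERLAKE already do. A self-contained miniature of that dispatch style (hypothetical function names; only the TARGET defines come from the diff):

#include <stdio.h>

/* Stand-ins for per-target microkernels (hypothetical, for illustration). */
static void ssymv_kernel_haswell(void) { puts("AVX2/FMA3 path"); }
static void ssymv_kernel_sandy(void)   { puts("AVX path"); }
static void ssymv_kernel_generic(void) { puts("plain C path"); }

/* The wrapper resolves one implementation at compile time; SAPPHIRERAPIDS
 * is simply added to the branch that already serves SKYLAKEX/COOPERLAKE. */
#if defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) \
 || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
#define SSYMV_KERNEL ssymv_kernel_haswell
#elif defined(SANDYBRIDGE)
#define SSYMV_KERNEL ssymv_kernel_sandy
#else
#define SSYMV_KERNEL ssymv_kernel_generic
#endif

int main(void) { SSYMV_KERNEL(); return 0; }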
#include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "ssymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index fea4fc746..55780734f 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index b853ef365..77331d95f 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index bad367e91..b61182303 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index 147201751..99bc07d50 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/tobf16.c b/kernel/x86_64/tobf16.c index 3d1796621..a88fdcc2e 100644 --- a/kernel/x86_64/tobf16.c +++ b/kernel/x86_64/tobf16.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
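The symv_*_sse*.S hunks above only add SAPPHIRERAPIDS to the target list that selects prefetcht0 and the PREFETCHSIZE distance. For readers more comfortable in C than in the assembly, the idea is to prefetch a fixed number of elements ahead of the streaming loads; a rough analogue under that assumption (the distance and loop shape below are illustrative, not lifted from the kernels):

#include <xmmintrin.h>   /* _mm_prefetch, _MM_HINT_T0 */

/* Touch data PREFETCH_DISTANCE elements ahead of the current position so it
 * is already in cache when the arithmetic reaches it.  The distance is only
 * indicative of the (16 * 12) / (16 * 24) values in the macros above. */
#define PREFETCH_DISTANCE (16 * 12)

static void saxpy_prefetched(long n, float alpha, const float *x, float *y)
{
    for (long i = 0; i < n; i++) {
        if (i + PREFETCH_DISTANCE < n) {
            _mm_prefetch((const char *)(x + i + PREFETCH_DISTANCE), _MM_HINT_T0);
            _mm_prefetch((const char *)(y + i + PREFETCH_DISTANCE), _MM_HINT_T0);
        }
        y[i] += alpha * x[i];
    }
}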
#else #endif -#if defined(COOPERLAKE) +#if defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #if defined(DOUBLE) #include "dtobf16_microk_cooperlake.c" #elif defined(SINGLE) diff --git a/kernel/x86_64/zasum.c b/kernel/x86_64/zasum.c index 6e758e2e3..80e95a2c8 100644 --- a/kernel/x86_64/zasum.c +++ b/kernel/x86_64/zasum.c @@ -130,7 +130,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) mode = BLAS_DOUBLE | BLAS_COMPLEX; #endif blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, - NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c index b44c53801..e257a5456 100644 --- a/kernel/x86_64/zasum_microk_skylakex-2.c +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -16,7 +16,7 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) if (n2 < 32) { __m128d accum_10, accum_11, accum_12, accum_13; - __m128d abs_mask1; + __m128d abs_mask1 = abs_mask1; accum_10 = _mm_setzero_pd(); accum_11 = _mm_setzero_pd(); diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index 25e9f6d42..8786870bd 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zaxpy_microk_bulldozer-2.c" #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zaxpy_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zaxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zaxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 1bc785ac1..c52575d07 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "zdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zdot_microk_sandy-2.c" @@ -215,7 +215,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, result, 0, - ( void *)zdot_thread_function, nthreads); + (int (*)(void))zdot_thread_function, nthreads); ptr = (OPENBLAS_COMPLEX_FLOAT *)result; for (i = 0; i < nthreads; i++) { diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 1f9d41859..2d6866a78 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
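The zasum hunks above adjust the callback cast passed to the threading helper (the per-thread partial results are still summed by the caller) and touch the small-n path of the Skylake-X microkernel; the quantity computed is unchanged: BLAS ?zasum returns the sum of |Re(x_i)| + |Im(x_i)|. A unit-stride scalar reference (illustrative, not part of the patch):

#include <math.h>
#include <stddef.h>

/* Scalar reference for dzasum with inc_x == 1: x holds n interleaved
 * (real, imag) pairs and the result is sum(|re| + |im|). */
static double zasum_reference(size_t n, const double *x)
{
    double sum = 0.0;
    for (size_t i = 0; i < n; i++)
        sum += fabs(x[2 * i]) + fabs(x[2 * i + 1]);
    return sum;
}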
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zgemv_n_microk_haswell-4.c" #elif defined(SANDYBRIDGE) #include "zgemv_n_microk_sandy-4.c" diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 34f28b224..c2791e0f3 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 09a702a81..3744c98bb 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index 83ed41ba1..df190c64c 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 7ed2faf0f..bfe0cf7ee 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) @@ -451,7 +451,6 @@ #endif MOVDDUP(4 * SIZE, A1, a1) - MOVDDUP(6 * SIZE, A2, a2) movsd 0 * SIZE(YY), yy1 movhpd 1 * SIZE(YY), yy1 @@ -471,7 +470,9 @@ subq IS, I subq $2, I sarq $2, I - jle .L15 + jle .L14 + + MOVDDUP(6 * SIZE - (4 * SIZE), A2, a2) ALIGN_3 .L12: @@ -632,6 +633,16 @@ jg .L12 ALIGN_3 +.L14: + movq M, I + subq IS, I + subq $2, I + testq $2, I + jle .L16 + + MOVDDUP(6 * SIZE - (4 * SIZE), A2, a2) + jmp .L15_pastcheck + .L15: movq M, I subq IS, I @@ -639,6 +650,7 @@ testq $2, I jle .L16 +.L15_pastcheck: movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 5945f3f81..13176ce9c 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) 
|| defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 484d74f14..1657885c0 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index aedaa308d..ada1944b2 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -566,8 +566,8 @@ void LAPACK_cgbrfsx( lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, lapack_complex_float const* AB, lapack_int const* ldab, lapack_complex_float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, - float* R, - float* C, + const float* R, + const float* C, lapack_complex_float const* B, lapack_int const* ldb, lapack_complex_float* X, lapack_int const* ldx, float* rcond, @@ -585,8 +585,8 @@ void LAPACK_dgbrfsx( lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, double const* AB, lapack_int const* ldab, double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, - double* R, - double* C, + const double* R, + const double* C, double const* B, lapack_int const* ldb, double* X, lapack_int const* ldx, double* rcond, @@ -604,8 +604,8 @@ void LAPACK_sgbrfsx( lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, float const* AB, lapack_int const* ldab, float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, - float* R, - float* C, + const float* R, + const float* C, float const* B, lapack_int const* ldb, float* X, lapack_int const* ldx, float* rcond, @@ -623,8 +623,8 @@ void LAPACK_zgbrfsx( lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, lapack_complex_double const* AB, lapack_int const* ldab, lapack_complex_double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, - double* R, - double* C, + const double* R, + const double* C, lapack_complex_double const* B, lapack_int const* ldb, lapack_complex_double* X, lapack_int const* ldx, double* rcond, @@ -2941,6 +2941,42 @@ void LAPACK_zgetsls( lapack_complex_double* work, lapack_int const* lwork, lapack_int* info ); +#define LAPACK_cgetsqrhrt LAPACK_GLOBAL(cgetsqrhrt,CGETSQRHRT) +void LAPACK_cgetsqrhrt( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* T, lapack_int const* ldt, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgetsqrhrt LAPACK_GLOBAL(dgetsqrhrt,DGETSQRHRT) +void LAPACK_dgetsqrhrt( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2, + 
double* A, lapack_int const* lda, + double* T, lapack_int const* ldt, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgetsqrhrt LAPACK_GLOBAL(sgetsqrhrt,SGETSQRHRT) +void LAPACK_sgetsqrhrt( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2, + float* A, lapack_int const* lda, + float* T, lapack_int const* ldt, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgetsqrhrt LAPACK_GLOBAL(zgetsqrhrt,ZGETSQRHRT) +void LAPACK_zgetsqrhrt( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* T, lapack_int const* ldt, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + #define LAPACK_cggbak LAPACK_GLOBAL(cggbak,CGGBAK) void LAPACK_cggbak( char const* job, char const* side, @@ -4768,7 +4804,7 @@ void LAPACK_chegst( lapack_int const* itype, char const* uplo, lapack_int const* n, lapack_complex_float* A, lapack_int const* lda, - lapack_complex_float* B, lapack_int const* ldb, + const lapack_complex_float* B, lapack_int const* ldb, lapack_int* info ); #define LAPACK_zhegst LAPACK_GLOBAL(zhegst,ZHEGST) @@ -4776,7 +4812,7 @@ void LAPACK_zhegst( lapack_int const* itype, char const* uplo, lapack_int const* n, lapack_complex_double* A, lapack_int const* lda, - lapack_complex_double* B, lapack_int const* ldb, + const lapack_complex_double* B, lapack_int const* ldb, lapack_int* info ); #define LAPACK_chegv LAPACK_GLOBAL(chegv,CHEGV) @@ -4913,7 +4949,7 @@ void LAPACK_cherfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_float const* A, lapack_int const* lda, lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - float* S, + const float* S, lapack_complex_float const* B, lapack_int const* ldb, lapack_complex_float* X, lapack_int const* ldx, float* rcond, @@ -4931,7 +4967,7 @@ void LAPACK_zherfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_double const* A, lapack_int const* lda, lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - double* S, + const double* S, lapack_complex_double const* B, lapack_int const* ldb, lapack_complex_double* X, lapack_int const* ldx, double* rcond, @@ -7251,6 +7287,24 @@ void LAPACK_sorgtr( float* work, lapack_int const* lwork, lapack_int* info ); +#define LAPACK_dorgtsqr_row LAPACK_GLOBAL(dorgtsqr_row,DORGTSQR_ROW) +void LAPACK_dorgtsqr_row( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb, lapack_int const* nb, + double* A, lapack_int const* lda, + double const* T, lapack_int const* ldt, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sorgtsqr_row LAPACK_GLOBAL(sorgtsqr_row,SORGTSQR_ROW) +void LAPACK_sorgtsqr_row( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb, lapack_int const* nb, + float* A, lapack_int const* lda, + float const* T, lapack_int const* ldt, + float* work, lapack_int const* lwork, + lapack_int* info ); + #define LAPACK_dormbr LAPACK_GLOBAL(dormbr,DORMBR) void LAPACK_dormbr( char const* vect, char const* side, char const* trans, @@ -8005,7 +8059,7 @@ void LAPACK_cporfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_float const* A, lapack_int const* lda, lapack_complex_float const* AF, lapack_int const* ldaf, - float* S, + const float* S, lapack_complex_float const* B, lapack_int const* ldb, 
lapack_complex_float* X, lapack_int const* ldx, float* rcond, @@ -8023,7 +8077,7 @@ void LAPACK_dporfsx( lapack_int const* n, lapack_int const* nrhs, double const* A, lapack_int const* lda, double const* AF, lapack_int const* ldaf, - double* S, + const double* S, double const* B, lapack_int const* ldb, double* X, lapack_int const* ldx, double* rcond, @@ -8041,7 +8095,7 @@ void LAPACK_sporfsx( lapack_int const* n, lapack_int const* nrhs, float const* A, lapack_int const* lda, float const* AF, lapack_int const* ldaf, - float* S, + const float* S, float const* B, lapack_int const* ldb, float* X, lapack_int const* ldx, float* rcond, @@ -8059,7 +8113,7 @@ void LAPACK_zporfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_double const* A, lapack_int const* lda, lapack_complex_double const* AF, lapack_int const* ldaf, - double* S, + const double* S, lapack_complex_double const* B, lapack_int const* ldb, lapack_complex_double* X, lapack_int const* ldx, double* rcond, @@ -10756,7 +10810,7 @@ void LAPACK_csyrfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_float const* A, lapack_int const* lda, lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - float* S, + const float* S, lapack_complex_float const* B, lapack_int const* ldb, lapack_complex_float* X, lapack_int const* ldx, float* rcond, @@ -10774,7 +10828,7 @@ void LAPACK_dsyrfsx( lapack_int const* n, lapack_int const* nrhs, double const* A, lapack_int const* lda, double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - double* S, + const double* S, double const* B, lapack_int const* ldb, double* X, lapack_int const* ldx, double* rcond, @@ -10792,7 +10846,7 @@ void LAPACK_ssyrfsx( lapack_int const* n, lapack_int const* nrhs, float const* A, lapack_int const* lda, float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - float* S, + const float* S, float const* B, lapack_int const* ldb, float* X, lapack_int const* ldx, float* rcond, @@ -10810,7 +10864,7 @@ void LAPACK_zsyrfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_double const* A, lapack_int const* lda, lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - double* S, + const double* S, lapack_complex_double const* B, lapack_int const* ldb, lapack_complex_double* X, lapack_int const* ldx, double* rcond, @@ -11556,7 +11610,7 @@ void LAPACK_zsytrs( void LAPACK_csytrs2( char const* uplo, lapack_int const* n, lapack_int const* nrhs, - lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv, + const lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv, lapack_complex_float* B, lapack_int const* ldb, lapack_complex_float* work, lapack_int* info ); @@ -11565,7 +11619,7 @@ void LAPACK_csytrs2( void LAPACK_dsytrs2( char const* uplo, lapack_int const* n, lapack_int const* nrhs, - double* A, lapack_int const* lda, lapack_int const* ipiv, + const double* A, lapack_int const* lda, lapack_int const* ipiv, double* B, lapack_int const* ldb, double* work, lapack_int* info ); @@ -11574,7 +11628,7 @@ void LAPACK_dsytrs2( void LAPACK_ssytrs2( char const* uplo, lapack_int const* n, lapack_int const* nrhs, - float* A, lapack_int const* lda, lapack_int const* ipiv, + const float* A, lapack_int const* lda, lapack_int const* ipiv, float* B, lapack_int const* ldb, float* work, lapack_int* info ); @@ -11583,7 +11637,7 @@ void LAPACK_ssytrs2( void LAPACK_zsytrs2( char const* uplo, lapack_int const* n, lapack_int const* nrhs, - lapack_complex_double* A, lapack_int 
const* lda, lapack_int const* ipiv, + const lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv, lapack_complex_double* B, lapack_int const* ldb, lapack_complex_double* work, lapack_int* info ); @@ -13540,6 +13594,24 @@ void LAPACK_zungtr( lapack_complex_double* work, lapack_int const* lwork, lapack_int* info ); +#define LAPACK_cungtsqr_row LAPACK_GLOBAL(cungtsqr_row,CUNGTSQR_ROW) +void LAPACK_cungtsqr_row( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb, lapack_int const* nb, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float const* T, lapack_int const* ldt, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zungtsqr_row LAPACK_GLOBAL(zungtsqr_row,ZUNGTSQR_ROW) +void LAPACK_zungtsqr_row( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb, lapack_int const* nb, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double const* T, lapack_int const* ldt, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + #define LAPACK_cunmbr LAPACK_GLOBAL(cunmbr,CUNMBR) void LAPACK_cunmbr( char const* vect, char const* side, char const* trans, diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index 012c104bb..5c129db91 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -1867,11 +1867,11 @@ lapack_int LAPACKE_zheevx( int matrix_layout, char jobz, char range, char uplo, lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ); lapack_int LAPACKE_zhegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ); lapack_int LAPACKE_chegv( int matrix_layout, lapack_int itype, char jobz, @@ -2598,6 +2598,15 @@ lapack_int LAPACKE_sorgtr( int matrix_layout, char uplo, lapack_int n, float* a, lapack_int LAPACKE_dorgtr( int matrix_layout, char uplo, lapack_int n, double* a, lapack_int lda, const double* tau ); +lapack_int LAPACKE_sorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + float* a, lapack_int lda, + const float* t, lapack_int ldt ); +lapack_int LAPACKE_dorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + double* a, lapack_int lda, + const double* t, lapack_int ldt ); + lapack_int LAPACKE_sormbr( int matrix_layout, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const float* a, lapack_int lda, const float* tau, @@ -4577,6 +4586,15 @@ lapack_int LAPACKE_zungtr( int matrix_layout, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau ); +lapack_int LAPACKE_cungtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* t, lapack_int ldt ); +lapack_int LAPACKE_zungtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* t, lapack_int ldt ); + lapack_int LAPACKE_cunmbr( int matrix_layout, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const lapack_complex_float* a, 
lapack_int lda, @@ -6932,11 +6950,11 @@ lapack_int LAPACKE_zheevx_work( int matrix_layout, char jobz, char range, lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ); lapack_int LAPACKE_zhegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ); lapack_int LAPACKE_chegv_work( int matrix_layout, lapack_int itype, char jobz, @@ -7880,6 +7898,19 @@ lapack_int LAPACKE_dorgtr_work( int matrix_layout, char uplo, lapack_int n, double* a, lapack_int lda, const double* tau, double* work, lapack_int lwork ); +lapack_int LAPACKE_sorgtsqr_row_work( int matrix_layout, + lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + float* a, lapack_int lda, + const float* t, lapack_int ldt, + float* work, lapack_int lwork ); +lapack_int LAPACKE_dorgtsqr_row_work( int matrix_layout, + lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + double* a, lapack_int lda, + const double* t, lapack_int ldt, + double* work, lapack_int lwork ); + lapack_int LAPACKE_sormbr_work( int matrix_layout, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const float* a, lapack_int lda, @@ -10281,6 +10312,19 @@ lapack_int LAPACKE_zungtr_work( int matrix_layout, char uplo, lapack_int n, const lapack_complex_double* tau, lapack_complex_double* work, lapack_int lwork ); +lapack_int LAPACKE_cungtsqr_row_work( int matrix_layout, + lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* t, lapack_int ldt, + lapack_complex_float* work, lapack_int lwork ); +lapack_int LAPACKE_zungtsqr_row_work( int matrix_layout, + lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* t, lapack_int ldt, + lapack_complex_double* work, lapack_int lwork ); + lapack_int LAPACKE_cunmbr_work( int matrix_layout, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const lapack_complex_float* a, @@ -10553,11 +10597,11 @@ lapack_int LAPACKE_csytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, lapack_complex_float* work, lapack_int nb ); lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb ); lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work ); @@ -10718,10 +10762,10 @@ lapack_int LAPACKE_dsytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, double* work, lapack_int nb ); lapack_int LAPACKE_dsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, lapack_int lda, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb ); lapack_int LAPACKE_dsytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, + lapack_int nrhs, const double* a, lapack_int lda, const 
lapack_int* ipiv, double* b, lapack_int ldb, double* work ); lapack_int LAPACKE_sbbcsd( int matrix_layout, char jobu1, char jobu2, @@ -10813,10 +10857,10 @@ lapack_int LAPACKE_ssytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, float* work, lapack_int nb ); lapack_int LAPACKE_ssytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, lapack_int lda, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb ); lapack_int LAPACKE_ssytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb, float* work ); lapack_int LAPACKE_zbbcsd( int matrix_layout, char jobu1, char jobu2, @@ -10898,11 +10942,11 @@ lapack_int LAPACKE_zsytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, lapack_complex_double* work, lapack_int nb ); lapack_int LAPACKE_zsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_double* a, + lapack_int nrhs, const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb ); lapack_int LAPACKE_zsytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_double* a, + lapack_int nrhs, const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work ); @@ -12026,6 +12070,44 @@ lapack_int LAPACKE_zgetsls_work( int matrix_layout, char trans, lapack_int m, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work, lapack_int lwork ); +lapack_int LAPACKE_sgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + float* a, lapack_int lda, + float* t, lapack_int ldt ); +lapack_int LAPACKE_dgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + double* a, lapack_int lda, + double* t, lapack_int ldt ); +lapack_int LAPACKE_cgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_float* a, lapack_int lda, + lapack_complex_float* t, lapack_int ldt ); +lapack_int LAPACKE_zgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_double* a, lapack_int lda, + lapack_complex_double* t, lapack_int ldt ); + +lapack_int LAPACKE_sgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + float* a, lapack_int lda, + float* t, lapack_int ldt, + float* work, lapack_int lwork ); +lapack_int LAPACKE_dgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + double* a, lapack_int lda, + double* t, lapack_int ldt, + double* work, lapack_int lwork ); +lapack_int LAPACKE_cgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_float* a, lapack_int lda, + lapack_complex_float* t, lapack_int ldt, + lapack_complex_float* work, lapack_int lwork ); +lapack_int LAPACKE_zgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_double* a, lapack_int lda, + lapack_complex_double* t, lapack_int ldt, + lapack_complex_double* work, lapack_int lwork ); + lapack_int LAPACKE_ssyev_2stage( 
int matrix_layout, char jobz, char uplo, lapack_int n, float* a, lapack_int lda, float* w ); lapack_int LAPACKE_dsyev_2stage( int matrix_layout, char jobz, char uplo, lapack_int n, diff --git a/lapack-netlib/LAPACKE/include/lapacke_utils.h b/lapack-netlib/LAPACKE/include/lapacke_utils.h index a9236d23f..ec29f24fc 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_utils.h +++ b/lapack-netlib/LAPACKE/include/lapacke_utils.h @@ -67,7 +67,11 @@ extern "C" { void LAPACKE_xerbla( const char *name, lapack_int info ); /* Compare two chars (case-insensitive) */ -lapack_logical LAPACKE_lsame( char ca, char cb ); +lapack_logical LAPACKE_lsame( char ca, char cb ) +#if defined __GNUC__ + __attribute__((const)) +#endif + ; /* Functions to convert column-major to row-major 2d arrays and vice versa. */ void LAPACKE_cgb_trans( int matrix_layout, lapack_int m, lapack_int n, diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index a602dd7a0..7f827e1c9 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -162,6 +162,8 @@ lapacke_cgetrs.o \ lapacke_cgetrs_work.o \ lapacke_cgetsls.o \ lapacke_cgetsls_work.o \ +lapacke_cgetsqrhrt.o \ +lapacke_cgetsqrhrt_work.o \ lapacke_cggbak.o \ lapacke_cggbak_work.o \ lapacke_cggbal.o \ @@ -634,6 +636,8 @@ lapacke_cungrq.o \ lapacke_cungrq_work.o \ lapacke_cungtr.o \ lapacke_cungtr_work.o \ +lapacke_cungtsqr_row.o \ +lapacke_cungtsqr_row_work.o \ lapacke_cunmbr.o \ lapacke_cunmbr_work.o \ lapacke_cunmhr.o \ @@ -778,6 +782,8 @@ lapacke_dgetrs.o \ lapacke_dgetrs_work.o \ lapacke_dgetsls.o \ lapacke_dgetsls_work.o \ +lapacke_dgetsqrhrt.o \ +lapacke_dgetsqrhrt_work.o \ lapacke_dggbak.o \ lapacke_dggbak_work.o \ lapacke_dggbal.o \ @@ -900,6 +906,8 @@ lapacke_dorgrq.o \ lapacke_dorgrq_work.o \ lapacke_dorgtr.o \ lapacke_dorgtr_work.o \ +lapacke_dorgtsqr_row.o \ +lapacke_dorgtsqr_row_work.o \ lapacke_dormbr.o \ lapacke_dormbr_work.o \ lapacke_dormhr.o \ @@ -1348,6 +1356,8 @@ lapacke_sgetrs.o \ lapacke_sgetrs_work.o \ lapacke_sgetsls.o \ lapacke_sgetsls_work.o \ +lapacke_sgetsqrhrt.o \ +lapacke_sgetsqrhrt_work.o \ lapacke_sggbak.o \ lapacke_sggbak_work.o \ lapacke_sggbal.o \ @@ -1468,6 +1478,8 @@ lapacke_sorgrq.o \ lapacke_sorgrq_work.o \ lapacke_sorgtr.o \ lapacke_sorgtr_work.o \ +lapacke_sorgtsqr_row.o \ +lapacke_sorgtsqr_row_work.o \ lapacke_sormbr.o \ lapacke_sormbr_work.o \ lapacke_sormhr.o \ @@ -1908,6 +1920,8 @@ lapacke_zgetrs.o \ lapacke_zgetrs_work.o \ lapacke_zgetsls.o \ lapacke_zgetsls_work.o \ +lapacke_zgetsqrhrt.o \ +lapacke_zgetsqrhrt_work.o \ lapacke_zggbak.o \ lapacke_zggbak_work.o \ lapacke_zggbal.o \ @@ -2380,6 +2394,8 @@ lapacke_zungrq.o \ lapacke_zungrq_work.o \ lapacke_zungtr.o \ lapacke_zungtr_work.o \ +lapacke_zungtsqr_row.o \ +lapacke_zungtsqr_row_work.o \ lapacke_zunmbr.o \ lapacke_zunmbr_work.o \ lapacke_zunmhr.o \ diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c index 558a7f308..4256c0f04 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c @@ -56,6 +56,8 @@ lapack_int LAPACKE_cgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? 
n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -73,7 +75,7 @@ lapack_int LAPACKE_cgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_cgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_cgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c new file mode 100644 index 000000000..0e67e0b83 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c @@ -0,0 +1,80 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function cgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_float* a, lapack_int lda, + lapack_complex_float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_float* work = NULL; + lapack_complex_float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_cgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_C2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_cgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c new file mode 100644 index 000000000..598f193e6 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function cgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_float* a, lapack_int lda, + lapack_complex_float* t, lapack_int ldt, + lapack_complex_float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_complex_float* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + lapack_complex_float* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c index aa78e678e..dbb2753d1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c @@ -78,7 +78,7 @@ lapack_int LAPACKE_cheev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c index d26c84785..2f25c187a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_cheevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c index e8f212efb..9e8a1c4db 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_cheevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegst.c b/lapack-netlib/LAPACKE/src/lapacke_chegst.c index ff7dd3532..c628017c2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegst.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ) { if( matrix_layout 
!= LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c b/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c index a29e01961..001863819 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegv.c b/lapack-netlib/LAPACKE/src/lapacke_chegv.c index 15d052987..c01525662 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegv.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_chegv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c index 537b9450b..fc3395833 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_chegv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegvd.c b/lapack-netlib/LAPACKE/src/lapacke_chegvd.c index 98c901982..fe7b39cee 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegvd.c @@ -55,10 +55,10 @@ lapack_int LAPACKE_chegvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegvx.c b/lapack-netlib/LAPACKE/src/lapacke_chegvx.c index 3ba62746e..d56e3ee46 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegvx.c @@ -60,7 +60,7 @@ lapack_int LAPACKE_chegvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_s_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c b/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c index 6937752c4..fc0d4e3d2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c @@ -46,7 +46,7 @@ lapack_int LAPACKE_chetri2x( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c index 80d262626..eba359312 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_clacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_clacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c index 8c4c21935..4779f10d2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c @@ -41,45 +41,46 @@ float LAPACKE_clantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; float res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - lapack_complex_float* a_t = NULL; float* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_clantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (lapack_complex_float*) - LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_ctr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_clantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_clantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_clascl.c b/lapack-netlib/LAPACKE/src/lapacke_clascl.c index fdcb02947..4f4e0bf35 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_clascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_clascl( int matrix_layout, char type, lapack_int kl, LAPACKE_cgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_chb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_claset_work.c b/lapack-netlib/LAPACKE/src/lapacke_claset_work.c index 7b25815e7..1b4fed17a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_claset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_claset_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_claset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_claset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_complex_float* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_csyconv.c b/lapack-netlib/LAPACKE/src/lapacke_csyconv.c index 2eb942e4e..771395e97 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csyconv.c @@ -45,7 +45,7 @@ lapack_int LAPACKE_csyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_csy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c index 44405c993..f4a0a4334 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c index 8567a07d5..d914c1d69 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work ) diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c b/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c index fd0a40c17..8ca652456 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ctrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ctr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c b/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c index c4ea703af..7b2e3a169 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ctrttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ctr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_cungtr.c b/lapack-netlib/LAPACKE/src/lapacke_cungtr.c index ddae70345..faa3ef6d3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cungtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cungtr.c @@ -48,7 +48,7 @@ lapack_int LAPACKE_cungtr( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_c_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c new file mode 100644 index 000000000..bb551fcbc --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function cungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cungtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_float* work = NULL; + lapack_complex_float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cungtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_cge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_cungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_C2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_cungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cungtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c new file mode 100644 index 000000000..96b18ab13 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c @@ -0,0 +1,109 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function cungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cungtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* t, lapack_int ldt, + lapack_complex_float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + lapack_complex_float* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + lapack_complex_float* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + LAPACKE_cge_trans( matrix_layout, nb, n, a, lda, t_t, ldt_t ); + /* Call LAPACK function and adjust info */ + LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c index d9fb2dca0..71ad23f2f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c @@ -52,7 +52,7 @@ lapack_int LAPACKE_cunmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ r = LAPACKE_lsame( side, 'l' ) ? m : n; - if( LAPACKE_cge_nancheck( matrix_layout, r, r, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, r, a, lda ) ) { return -7; } if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c index 7dbc9bb88..671def1df 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c @@ -54,6 +54,8 @@ lapack_int LAPACKE_dgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -71,7 +73,7 @@ lapack_int LAPACKE_dgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_dgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_dgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c new file mode 100644 index 000000000..cf0e3200c --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c @@ -0,0 +1,79 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + double* a, lapack_int lda, + double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + double* work = NULL; + double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_dgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_dgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c new file mode 100644 index 000000000..f91887ffe --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c @@ -0,0 +1,106 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + double* a, lapack_int lda, + double* t, lapack_int ldt, + double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + double* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + double* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c index f1a505486..88f4489a3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_dlacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_dlacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c index 5b2a6c535..9c9b0ea8b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c @@ -40,44 +40,46 @@ double LAPACKE_dlantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; double res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - double* a_t = NULL; double* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_dlantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - 
LAPACKE_dtr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_dlantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_dlantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlascl.c b/lapack-netlib/LAPACKE/src/lapacke_dlascl.c index 5b579a5d1..058105127 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_dlascl( int matrix_layout, char type, lapack_int kl, LAPACKE_dgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_dsb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c index 4b59fe627..f1444b5e2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_dlaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_dlaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); double* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c b/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c index 86184b784..587805de6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_dorgtr( int matrix_layout, char uplo, lapack_int n, double* a #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_d_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c new file mode 100644 index 000000000..1da3405a8 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c @@ -0,0 +1,82 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + double* a, lapack_int lda, + const double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + double* work = NULL; + double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_dge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_dorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_dorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c new file mode 100644 index 000000000..e16467f3a --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dorgtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + double* a, lapack_int lda, + const double* t, lapack_int ldt, + double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + double* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + double* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + LAPACKE_dge_trans( matrix_layout, nb, n, t, ldt, t_t, ldt_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c index db75a6609..0b1c54b9b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c @@ -51,7 +51,7 @@ lapack_int LAPACKE_dormtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ r = LAPACKE_lsame( side, 'l' ) ? m : n; - if( LAPACKE_dge_nancheck( matrix_layout, r, r, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, r, a, lda ) ) { return -7; } if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c b/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c index cca9be489..36ff7c40c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_dsyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c index f696c608f..78f9e80ed 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c @@ -72,7 +72,7 @@ lapack_int LAPACKE_dsyev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c index 6f9c02f6a..d68989aa6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda 
); } else { LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c index 81ba2acb3..25d075d46 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygst.c b/lapack-netlib/LAPACKE/src/lapacke_dsygst.c index 800a30b24..69b90e758 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygst.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_dsygst( int matrix_layout, lapack_int itype, char uplo, if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygv.c b/lapack-netlib/LAPACKE/src/lapacke_dsygv.c index 533b6a446..4ece69794 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygv.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_dsygv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c index 974b63e54..0016a7d06 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_dsygv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c b/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c index 51f333359..0db0cfa67 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c @@ -51,10 +51,10 @@ lapack_int LAPACKE_dsygvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c b/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c index 02d54d7fa..54fa6ff36 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_dsygvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_d_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c index 4d73ef3c1..46c90190f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_dsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, lapack_int lda, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c index caffa5b4b..c937c39c5 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_dsytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb, double* work ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c b/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c index 66d1e5a2c..de379a970 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_dtrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dtr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c b/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c index 89f01dc95..d17593471 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_dtrttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dtr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c index 9dc5509c9..941d83cad 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c @@ -54,6 +54,8 @@ lapack_int LAPACKE_sgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? 
n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -71,7 +73,7 @@ lapack_int LAPACKE_sgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_sgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_sgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c new file mode 100644 index 000000000..759afce48 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c @@ -0,0 +1,79 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function sgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + float* a, lapack_int lda, + float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + float* work = NULL; + float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_sgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_sgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c new file mode 100644 index 000000000..40193008d --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c @@ -0,0 +1,106 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function sgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + float* a, lapack_int lda, + float* t, lapack_int ldt, + float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + float* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + float* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c index e60167001..cdec2c967 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_slacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_slacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c index e1d4c270d..f77abef2c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c @@ -40,44 +40,46 @@ float LAPACKE_slantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; float res = 0.; if( 
matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - float* a_t = NULL; float* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_slantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_str_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_slantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_slantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_slascl.c b/lapack-netlib/LAPACKE/src/lapacke_slascl.c index 25bd9624e..62f7390ed 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_slascl( int matrix_layout, char type, lapack_int kl, LAPACKE_sgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_ssb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c index c89c9a6e1..4f2fa7b67 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_slaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_slaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); float* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c b/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c index 90dc435c9..804b7f8ef 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_sorgtr( int matrix_layout, char uplo, lapack_int n, float* a, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally 
check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_s_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c new file mode 100644 index 000000000..350783a78 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c @@ -0,0 +1,82 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function sorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + float* a, lapack_int lda, + const float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + float* work = NULL; + float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_sge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_sorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_sorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c new file mode 100644 index 000000000..a66f70b52 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function sorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sorgtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + float* a, lapack_int lda, + const float* t, lapack_int ldt, + float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + float* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + float* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + LAPACKE_sge_trans( matrix_layout, nb, n, t, ldt, t_t, ldt_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c index 9f0e9fddf..6ffe144cc 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c @@ -51,7 +51,7 @@ lapack_int LAPACKE_sormtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ r = LAPACKE_lsame( side, 'l' ) ? 
m : n; - if( LAPACKE_sge_nancheck( matrix_layout, r, r, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, r, a, lda ) ) { return -7; } if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c b/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c index 5fd0a78c5..ac41a354d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_ssyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c index abd62ddf3..1889a337c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c @@ -72,7 +72,7 @@ lapack_int LAPACKE_ssyev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c index d9fe47599..faadc92f1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_ssyevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c index bfbf49aee..434b52c01 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_ssyevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygst.c b/lapack-netlib/LAPACKE/src/lapacke_ssygst.c index 7b97f472b..4fb55960c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygst.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_ssygst( int matrix_layout, lapack_int itype, char uplo, if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygv.c b/lapack-netlib/LAPACKE/src/lapacke_ssygv.c index 8ec40d954..f139de1ab 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygv.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_ssygv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* 
Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c index a2eba6653..195fb1e54 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_ssygv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c b/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c index 5afe8d2de..e33ce2a7b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c @@ -51,10 +51,10 @@ lapack_int LAPACKE_ssygvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c b/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c index 1fe4e2c6c..8ffd9dc40 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_ssygvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_s_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c index 19f447cd8..a95a71469 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_ssytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, lapack_int lda, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c index 7d348b382..cf98f443d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_ssytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb, float* work ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_strttf.c b/lapack-netlib/LAPACKE/src/lapacke_strttf.c index fee7ab9ae..e3304fbe7 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_strttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_strttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_strttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_str_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_strttp.c b/lapack-netlib/LAPACKE/src/lapacke_strttp.c index 6c4b84aa3..2df79eb05 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_strttp.c +++ b/lapack-netlib/LAPACKE/src/lapacke_strttp.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_strttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_str_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c index 2d7c2b6f3..da73cd479 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c @@ -56,6 +56,8 @@ lapack_int LAPACKE_zgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -73,7 +75,7 @@ lapack_int LAPACKE_zgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_zgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_zgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c new file mode 100644 index 000000000..53557c92d --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c @@ -0,0 +1,80 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_double* a, lapack_int lda, + lapack_complex_double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_double* work = NULL; + lapack_complex_double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_zgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_Z2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_zgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c new file mode 100644 index 000000000..a6825df56 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_double* a, lapack_int lda, + lapack_complex_double* t, lapack_int ldt, + lapack_complex_double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_complex_double* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + lapack_complex_double* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c index d4e93aed2..8b7aa3518 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c @@ -78,7 +78,7 @@ lapack_int LAPACKE_zheev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c index fb33c3e2a..840c53876 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_zheevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c index 5af2a1269..b8509e04f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_zheevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegst.c b/lapack-netlib/LAPACKE/src/lapacke_zhegst.c index 8c4a5c374..aa2d84d84 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegst.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zhegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ) { if( 
matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c b/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c index 62fce1f27..f77894204 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zhegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegv.c b/lapack-netlib/LAPACKE/src/lapacke_zhegv.c index 683fcf487..587e2d4be 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegv.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_zhegv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c index 0f1b415a9..43569d99e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_zhegv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c b/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c index 1242a0eda..c287595ad 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c @@ -55,10 +55,10 @@ lapack_int LAPACKE_zhegvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c b/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c index 492bc4dad..83f2bda2e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c @@ -61,7 +61,7 @@ lapack_int LAPACKE_zhegvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_d_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c b/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c index a07bc8d52..15a8cc576 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c @@ -46,7 +46,7 @@ lapack_int LAPACKE_zhetri2x( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c index bb4e57b1e..fe36ed811 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_zlacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_zlacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c index e62f8a4e3..cccc4053e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c @@ -41,45 +41,46 @@ double LAPACKE_zlantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; double res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - lapack_complex_double* a_t = NULL; double* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_zlantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (lapack_complex_double*) - LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_ztr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_zlantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_zlantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlascl.c b/lapack-netlib/LAPACKE/src/lapacke_zlascl.c index 7e37d559c..8bf1ee767 
100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_zlascl( int matrix_layout, char type, lapack_int kl, LAPACKE_zgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_zhb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c index 9056e8fca..ecb6cba25 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_zlaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_zlaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_complex_double* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c b/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c index 2826efa53..074b15303 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c @@ -45,7 +45,7 @@ lapack_int LAPACKE_zsyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c index 7442702aa..3c85f9796 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_zsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_double* a, + lapack_int nrhs, const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c index ec05ce6d5..cdc97fa02 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zsytrs2_work( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, - lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work ) diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c b/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c index 8a5dfc271..8e8789ec6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ztrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ztr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c b/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c index 5dcf633bb..bd8485108 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ztrttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ztr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zungtr.c b/lapack-netlib/LAPACKE/src/lapacke_zungtr.c index 51785347e..adfaa7db9 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zungtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zungtr.c @@ -48,7 +48,7 @@ lapack_int LAPACKE_zungtr( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_z_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c new file mode 100644 index 000000000..71418fb84 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zungtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_double* work = NULL; + lapack_complex_double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zungtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_zge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_zungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_Z2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_zungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zungtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c new file mode 100644 index 000000000..909855864 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c @@ -0,0 +1,109 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zungtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* t, lapack_int ldt, + lapack_complex_double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + lapack_complex_double* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + lapack_complex_double* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + LAPACKE_zge_trans( matrix_layout, nb, n, a, lda, t_t, ldt_t ); + /* Call LAPACK function and adjust info */ + LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 83baac875..d1ee96667 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -135,14 +135,14 @@ SLASRC_O = \ slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \ - slarf.o slarfb.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \ + slarf.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \ slarrv.o slartv.o \ slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \ slasyf_rk.o \ slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o \ slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \ sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \ - sorgrq.o sorgtr.o sorgtsqr.o sorm2l.o sorm2r.o sorm22.o \ + sorgrq.o sorgtr.o sorgtsqr.o sorgtsqr_row.o sorm2l.o sorm2r.o sorm22.o \ sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \ sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \ spbstf.o spbsv.o spbsvx.o \ @@ -181,7 +181,7 @@ SLASRC_O = \ sgeqrt.o sgeqrt2.o sgeqrt3.o sgemqrt.o \ stpqrt.o stpqrt2.o stpmqrt.o stprfb.o \ sgelqt.o sgelqt3.o sgemlqt.o \ - sgetsls.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \ + sgetsls.o sgetsqrhrt.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \ sgelq.o slaswlq.o slamswlq.o sgemlq.o \ stplqt.o stplqt2.o stpmlqt.o \ sorhr_col.o slaorhr_col_getrfnp.o slaorhr_col_getrfnp2.o \ @@ -250,7 +250,7 @@ CLASRC_O = \ claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \ claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \ claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \ - clarf.o clarfb.o clarfg.o clarft.o clarfgp.o \ + clarf.o clarfb.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \ clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \ clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \ claswp.o clasyf.o clasyf_rook.o clasyf_rk.o clasyf_aa.o \ @@ -278,7 +278,7 @@ CLASRC_O = \ ctptrs.o ctrcon.o ctrevc.o ctrevc3.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ ctrsyl.o ctrti2.o ctrtri.o ctrtrs.o ctzrzf.o cung2l.o cung2r.o \ cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ - cungrq.o cungtr.o cungtsqr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o 
cunm22.o \ + cungrq.o cungtr.o cungtsqr.o cungtsqr_row.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o cunm22.o \ cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ cunmtr.o cupgtr.o cupmtr.o icmax1.o scsum1.o cstemr.o \ chfrk.o ctfttp.o clanhf.o cpftrf.o cpftri.o cpftrs.o ctfsm.o ctftri.o \ @@ -289,7 +289,7 @@ CLASRC_O = \ cgeqrt.o cgeqrt2.o cgeqrt3.o cgemqrt.o \ ctpqrt.o ctpqrt2.o ctpmqrt.o ctprfb.o \ cgelqt.o cgelqt3.o cgemlqt.o \ - cgetsls.o cgeqr.o clatsqr.o clamtsqr.o cgemqr.o \ + cgetsls.o cgetsqrhrt.o cgeqr.o clatsqr.o clamtsqr.o cgemqr.o \ cgelq.o claswlq.o clamswlq.o cgemlq.o \ ctplqt.o ctplqt2.o ctpmlqt.o \ cunhr_col.o claunhr_col_getrfnp.o claunhr_col_getrfnp2.o \ @@ -342,14 +342,14 @@ DLASRC_O = \ dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ - dlarf.o dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \ + dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \ dlargv.o dlarrv.o dlartv.o \ dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o \ dlasyf.o dlasyf_rook.o dlasyf_rk.o \ dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlauu2.o \ dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \ - dorgrq.o dorgtr.o dorgtsqr.o dorm2l.o dorm2r.o dorm22.o \ + dorgrq.o dorgtr.o dorgtsqr.o dorgtsqr_row.o dorm2l.o dorm2r.o dorm22.o \ dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \ dormr3.o dormrq.o dormrz.o dormtr.o dpbcon.o dpbequ.o dpbrfs.o \ dpbstf.o dpbsv.o dpbsvx.o \ @@ -389,7 +389,7 @@ DLASRC_O = \ dgeqrt.o dgeqrt2.o dgeqrt3.o dgemqrt.o \ dtpqrt.o dtpqrt2.o dtpmqrt.o dtprfb.o \ dgelqt.o dgelqt3.o dgemlqt.o \ - dgetsls.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \ + dgetsls.o dgetsqrhrt.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \ dgelq.o dlaswlq.o dlamswlq.o dgemlq.o \ dtplqt.o dtplqt2.o dtpmlqt.o \ dorhr_col.o dlaorhr_col_getrfnp.o dlaorhr_col_getrfnp2.o \ @@ -455,7 +455,7 @@ ZLASRC_O = \ zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \ zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \ zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \ - zlarcm.o zlarf.o zlarfb.o \ + zlarcm.o zlarf.o zlarfb.o zlarfb_gett.o \ zlarfg.o zlarft.o zlarfgp.o \ zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ @@ -484,7 +484,7 @@ ZLASRC_O = \ ztptrs.o ztrcon.o ztrevc.o ztrevc3.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrzf.o zung2l.o \ zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ - zungrq.o zungtr.o zungtsqr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \ + zungrq.o zungtr.o zungtsqr.o zungtsqr_row.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \ zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ zunmtr.o zupgtr.o \ zupmtr.o izmax1.o dzsum1.o zstemr.o \ @@ -498,7 +498,7 @@ ZLASRC_O = \ ztpqrt.o ztpqrt2.o ztpmqrt.o ztprfb.o \ ztplqt.o ztplqt2.o ztpmlqt.o \ zgelqt.o zgelqt3.o zgemlqt.o \ - zgetsls.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \ + zgetsls.o zgetsqrhrt.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \ zgelq.o zlaswlq.o zlamswlq.o zgemlq.o \ zunhr_col.o zlaunhr_col_getrfnp.o zlaunhr_col_getrfnp2.o \ zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \ diff --git a/lapack-netlib/SRC/cgeqrt2.f b/lapack-netlib/SRC/cgeqrt2.f index 9ee3e4f79..11221636d 100644 --- a/lapack-netlib/SRC/cgeqrt2.f 
+++ b/lapack-netlib/SRC/cgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup complexGEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE CGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN diff --git a/lapack-netlib/SRC/cgesdd.f b/lapack-netlib/SRC/cgesdd.f index 07341593f..34a80beea 100644 --- a/lapack-netlib/SRC/cgesdd.f +++ b/lapack-netlib/SRC/cgesdd.f @@ -281,9 +281,9 @@ $ CUNGQR, CUNMBR, SBDSDC, SLASCL, XERBLA * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, SISNAN REAL SLAMCH, CLANGE - EXTERNAL LSAME, SLAMCH, CLANGE + EXTERNAL LSAME, SLAMCH, CLANGE, SISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -647,6 +647,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = CLANGE( 'M', M, N, A, LDA, DUM ) + IF( SISNAN ( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN ISCL = 1 diff --git a/lapack-netlib/SRC/cgetsqrhrt.f b/lapack-netlib/SRC/cgetsqrhrt.f new file mode 100644 index 000000000..4e4dc1d4a --- /dev/null +++ b/lapack-netlib/SRC/cgetsqrhrt.f @@ -0,0 +1,349 @@ +*> \brief \b CGETSQRHRT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGETSQRHRT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGETSQRHRT computes a NB2-sized column blocked QR-factorization +*> of a complex M-by-N matrix A with M >= N, +*> +*> A = Q * R. +*> +*> The routine uses internally a NB1-sized column blocked and MB1-sized +*> row blocked TSQR-factorization and perfors the reconstruction +*> of the Householder vectors from the TSQR output. The routine also +*> converts the R_tsqr factor from the TSQR-factorization output into +*> the R factor that corresponds to the Householder QR-factorization, +*> +*> A = Q_tsqr * R_tsqr = Q * R. +*> +*> The output Q and R factors are stored in the same format as in CGEQRT +*> (Q is in blocked compact WY-representation). See the documentation +*> of CGEQRT for more details on the format. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. 
+*> \endverbatim +*> +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> The row block size to be used in the blocked TSQR. +*> MB1 > N. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> The column block size to be used in the blocked TSQR. +*> N >= NB1 >= 1. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> The block size to be used in the blocked QR that is +*> output. NB2 >= 1. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: an M-by-N matrix A. +*> +*> On exit: +*> a) the elements on and above the diagonal +*> of the array contain the N-by-N upper-triangular +*> matrix R corresponding to the Householder QR; +*> b) the elements below the diagonal represent Q by +*> the columns of blocked V (compact WY-representation). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is COMPLEX array, dimension (LDT,N)) +*> The upper triangular block reflectors stored in compact form +*> as a sequence of upper triangular blocks. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= NB2. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> where +*> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), +*> NB1LOCAL = MIN(NB1,N). +*> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, +*> LW1 = NB1LOCAL * N, +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup comlpexOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, + $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS +* .. +* .. 
External Subroutines .. + EXTERNAL CCOPY, CLATSQR, CUNGTSQR_ROW, CUNHR_COL, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, REAL, CMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB1.LE.N ) THEN + INFO = -3 + ELSE IF( NB1.LT.1 ) THEN + INFO = -4 + ELSE IF( NB2.LT.1 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + INFO = -9 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array: +* a) Matrix T and WORK for CLATSQR; +* b) N-by-N upper-triangular factor R_tsqr; +* c) Matrix T and array WORK for CUNGTSQR_ROW; +* d) Diagonal D for CUNHR_COL. +* + IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. +* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of CUNGTSQR_ROW work array. +* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. +* + CALL CLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL CCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL CUNGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL CUNHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. +* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the CUNHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. 
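As a rough illustration of this combined pass: the sign matrix S only flips the rows of R_tsqr whose reconstructed diagonal sign DIAG(I) is -1, i.e. R_hr(I,J) = DIAG(I) * R_tsqr(I,J) for J >= I. A minimal real-valued C sketch of the same copy-and-flip follows; the array names and the flat column-major layout are illustrative only and are not the routine's actual complex WORK layout.

    /* Illustrative sketch only: copy the upper-triangular factor r_tsqr
     * into a while flipping the sign of every row whose diagonal sign
     * (as returned by the Householder reconstruction step) is -1.
     * Real-valued and column-major for simplicity; the actual routine
     * works in complex arithmetic inside its WORK array. */
    #include <stddef.h>

    void copy_r_with_signs(int n, const double *r_tsqr, const double *diag,
                           double *a, int lda)
    {
        for (int i = 0; i < n; ++i) {
            double s = (diag[i] < 0.0) ? -1.0 : 1.0;   /* sign of row i */
            for (int j = i; j < n; ++j)                /* upper triangle only */
                a[i + (size_t)j * lda] = s * r_tsqr[i + (size_t)j * n];
        }
    }
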
+* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-CONE ) THEN + DO J = I, N + A( I, J ) = -CONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL CCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN +* +* End of CGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/cggglm.f b/lapack-netlib/SRC/cggglm.f index 336f41909..9c8e0eec3 100644 --- a/lapack-netlib/SRC/cggglm.f +++ b/lapack-netlib/SRC/cggglm.f @@ -271,8 +271,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = CZERO + END DO + DO I = 1, P + Y(I) = CZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/chgeqz.f b/lapack-netlib/SRC/chgeqz.f index 73d35621c..bcf5acd0b 100644 --- a/lapack-netlib/SRC/chgeqz.f +++ b/lapack-netlib/SRC/chgeqz.f @@ -319,13 +319,14 @@ REAL ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL, $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, - $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, - $ U12, X + $ CTEMP3, ESHIFT, S, SHIFT, SIGNBC, + $ U12, X, ABI12, Y * .. * .. External Functions .. + COMPLEX CLADIV LOGICAL LSAME REAL CLANHS, SLAMCH - EXTERNAL LSAME, CLANHS, SLAMCH + EXTERNAL CLADIV, LSAME, CLANHS, SLAMCH * .. * .. External Subroutines .. EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA @@ -350,6 +351,7 @@ ILSCHR = .TRUE. ISCHUR = 2 ELSE + ILSCHR = .TRUE. ISCHUR = 0 END IF * @@ -363,6 +365,7 @@ ILQ = .TRUE. ICOMPQ = 3 ELSE + ILQ = .TRUE. ICOMPQ = 0 END IF * @@ -376,6 +379,7 @@ ILZ = .TRUE. ICOMPZ = 3 ELSE + ILZ = .TRUE. ICOMPZ = 0 END IF * @@ -729,22 +733,34 @@ AD22 = ( ASCALE*H( ILAST, ILAST ) ) / $ ( BSCALE*T( ILAST, ILAST ) ) ABI22 = AD22 - U12*AD21 + ABI12 = AD12 - U12*AD11 * - T1 = HALF*( AD11+ABI22 ) - RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 ) - TEMP = REAL( T1-ABI22 )*REAL( RTDISC ) + - $ AIMAG( T1-ABI22 )*AIMAG( RTDISC ) - IF( TEMP.LE.ZERO ) THEN - SHIFT = T1 + RTDISC - ELSE - SHIFT = T1 - RTDISC + SHIFT = ABI22 + CTEMP = SQRT( ABI12 )*SQRT( AD21 ) + TEMP = ABS1( CTEMP ) + IF( CTEMP.NE.ZERO ) THEN + X = HALF*( AD11-SHIFT ) + TEMP2 = ABS1( X ) + TEMP = MAX( TEMP, ABS1( X ) ) + Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 ) + IF( TEMP2.GT.ZERO ) THEN + IF( REAL( X / TEMP2 )*REAL( Y )+ + $ AIMAG( X / TEMP2 )*AIMAG( Y ).LT.ZERO )Y = -Y + END IF + SHIFT = SHIFT - CTEMP*CLADIV( CTEMP, ( X+Y ) ) END IF ELSE * * Exceptional shift. Chosen for no particularly good reason. * - ESHIFT = ESHIFT + (ASCALE*H(ILAST,ILAST-1))/ - $ (BSCALE*T(ILAST-1,ILAST-1)) + IF( ( IITER / 20 )*20.EQ.IITER .AND. + $ BSCALE*ABS1(T( ILAST, ILAST )).GT.SAFMIN ) THEN + ESHIFT = ESHIFT + ( ASCALE*H( ILAST, + $ ILAST ) )/( BSCALE*T( ILAST, ILAST ) ) + ELSE + ESHIFT = ESHIFT + ( ASCALE*H( ILAST, + $ ILAST-1 ) )/( BSCALE*T( ILAST-1, ILAST-1 ) ) + END IF SHIFT = ESHIFT END IF * diff --git a/lapack-netlib/SRC/chseqr.f b/lapack-netlib/SRC/chseqr.f index cfcf725b2..32b6fa87b 100644 --- a/lapack-netlib/SRC/chseqr.f +++ b/lapack-netlib/SRC/chseqr.f @@ -320,10 +320,10 @@ * . CLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare CLAHQR failure. NL > NTINY = 11 is +* . through a rare CLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. 
(The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/claqr0.f b/lapack-netlib/SRC/claqr0.f index 2f0ea20db..233721352 100644 --- a/lapack-netlib/SRC/claqr0.f +++ b/lapack-netlib/SRC/claqr0.f @@ -260,7 +260,7 @@ * . CLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -355,22 +355,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'CLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'CLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -418,7 +418,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -558,7 +558,7 @@ * * ==== Got NS/2 or fewer shifts? Use CLAQR4 or * . CLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -659,7 +659,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/claqr4.f b/lapack-netlib/SRC/claqr4.f index fba286df7..94484e798 100644 --- a/lapack-netlib/SRC/claqr4.f +++ b/lapack-netlib/SRC/claqr4.f @@ -270,7 +270,7 @@ * . CLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -365,22 +365,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'CLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. 
==== * NSR = ILAENV( 15, 'CLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -428,7 +428,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -568,7 +568,7 @@ * * ==== Got NS/2 or fewer shifts? Use CLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -663,7 +663,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/claqr5.f b/lapack-netlib/SRC/claqr5.f index e4317a3ad..71f26d8c9 100644 --- a/lapack-netlib/SRC/claqr5.f +++ b/lapack-netlib/SRC/claqr5.f @@ -69,10 +69,9 @@ *> matrix entries. *> = 1: CLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: CLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. *> \endverbatim *> *> \param[in] N @@ -170,14 +169,14 @@ *> *> \param[out] U *> \verbatim -*> U is COMPLEX array, dimension (LDU,3*NSHFTS-3) +*> U is COMPLEX array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -189,7 +188,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is COMPLEX array, dimension (LDWV,3*NSHFTS-3) +*> WV is COMPLEX array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -215,7 +214,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -226,7 +225,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup complexOTHERauxiliary * @@ -235,6 +234,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -244,10 +248,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). 
+*> * ===================================================================== SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, $ H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, LDU, NV, $ WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -276,11 +285,11 @@ COMPLEX ALPHA, BETA, CDUM, REFSUM REAL H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP - INTEGER I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. REAL SLAMCH @@ -334,10 +343,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -349,28 +354,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 210 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL CLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 140 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -379,24 +395,156 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 10 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . 
separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL CLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), + $ S( 2*M22 ), V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = CONJG( V( 1, M22 ) )* + $ ( H( K+1, J )+CONJG( V( 2, M22 ) )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. ==== +* + IF( K.GE.KTOP) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) + IF( TST1.EQ.RZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + CABS1( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + CABS1( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + CABS1( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + CABS1( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + CABS1( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + CABS1( H( K+4, K+1 ) ) + END IF + IF( CABS1( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H21 = MIN( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H11 = MAX( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.RZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL CLAQR1( 3, H( KTOP, KTOP ), LDH, S( 2*M-1 ), $ S( 2*M ), V( 1, M ) ) ALPHA = V( 1, M ) CALL CLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. 
==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*CONJG( V( 2, M ) ) + H( K+3, K+2 ) = H( K+3, K+2 ) - + $ REFSUM*CONJG( V( 3, M ) ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL CLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -444,7 +592,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -458,163 +606,32 @@ END IF END IF END IF - 10 CONTINUE -* -* ==== Generate a 2-by-2 reflection, if needed. ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL CLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), - $ S( 2*M22 ), V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF -* -* ==== Multiply H by reflections from the left ==== -* - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 30 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 20 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = CONJG( V( 1, M ) )* - $ ( H( K+1, J )+CONJG( V( 2, M ) )*H( K+2, J )+ - $ CONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 20 CONTINUE - 30 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 40 J = MAX( K+1, KTOP ), JBOT - REFSUM = CONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+CONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 40 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 80 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 50 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 50 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 60 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 60 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
==== -* - DO 70 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 70 CONTINUE - END IF - END IF - 80 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 90 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 90 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 100 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 100 CONTINUE - ELSE IF( WANTZ ) THEN - DO 110 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 110 CONTINUE - END IF - END IF - END IF * -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 120 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. ==== +* + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + H( J, K+3 ) = H( J, K+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 70 CONTINUE +* +* ==== Perform update from left for subsequent +* . column. ==== +* + REFSUM = CONJG( V( 1, M ) )*( H( K+1, K+1 ) + $ +CONJG( V( 2, M ) )*H( K+2, K+1 ) + $ +CONJG( V( 3, M ) )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -625,6 +642,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) IF( TST1.EQ.RZERO ) THEN @@ -658,22 +677,77 @@ $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO END IF END IF - 120 CONTINUE + 80 CONTINUE +* +* ==== Multiply H by reflections from the left ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = CONJG( V( 1, M ) )* + $ ( H( K+1, J )+CONJG( V( 2, M ) )* + $ H( K+2, J )+CONJG( V( 3, M ) )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== * -* ==== Fill in the last row of each bulge. 
==== + IF( ACCUM ) THEN * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 130 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*CONJG( V( 2, M ) ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*CONJG( V( 3, M ) ) - 130 CONTINUE +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) ==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + U( J, KMS+3 ) = U( J, KMS+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + Z( J, K+3 ) = Z( J, K+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 140 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -687,220 +761,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN -* -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. 
==== -* - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 -* -* ==== Horizontal Multiply ==== -* - DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL CGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL CLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 150 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 +* +* ==== Horizontal Multiply ==== +* + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL CGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL CLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE +* +* ==== Vertical multiply ==== +* + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL CLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE +* +* ==== Z multiply (also vertical) ==== +* + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL CLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) - 160 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 170 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL CLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 170 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 180 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) 
==== -* - CALL CLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL CLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL CTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**H ==== -* - CALL CGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL CLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL CTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL CLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 180 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 190 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL CLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL CTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL CLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL CLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 190 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 200 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL CLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL CLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL CTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL CLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL CLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 200 CONTINUE - END IF + $ Z( JROW, INCOL+K1 ), LDZ ) + 170 CONTINUE END IF END IF - 210 CONTINUE + 180 CONTINUE * * ==== End of CLAQR5 ==== * diff --git a/lapack-netlib/SRC/clarfb_gett.f b/lapack-netlib/SRC/clarfb_gett.f new file mode 100644 index 000000000..ee6959ed8 --- /dev/null +++ b/lapack-netlib/SRC/clarfb_gett.f @@ -0,0 +1,597 @@ +*> \brief \b CLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +*> +* Definition: +* =========== +* +* SUBROUTINE CLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), B( LDB, * ), T( LDT, * ), +* $ WORK( LDWORK, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLARFB_GETT applies a complex Householder block reflector H from the +*> left to a complex (K+M)-by-N "triangular-pentagonal" matrix +*> composed of two block matrices: an upper trapezoidal K-by-N matrix A +*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored +*> in the array B. The block reflector H is stored in a compact +*> WY-representation, where the elementary reflectors are in the +*> arrays A, B and T. See Further Details section. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] IDENT +*> \verbatim +*> IDENT is CHARACTER*1 +*> If IDENT = not 'I', or not 'i', then V1 is unit +*> lower-triangular and stored in the left K-by-K block of +*> the input matrix A, +*> If IDENT = 'I' or 'i', then V1 is an identity matrix and +*> not stored. +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix B. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrices A and B. +*> N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The number or rows of the matrix A. +*> K is also order of the matrix T, i.e. the number of +*> elementary reflectors whose product defines the block +*> reflector. 0 <= K <= N. 
+*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX array, dimension (LDT,K) +*> The upper-triangular K-by-K matrix T in the representation +*> of the block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> +*> On entry: +*> a) In the K-by-N upper-trapezoidal part A: input matrix A. +*> b) In the columns below the diagonal: columns of V1 +*> (ones are not stored on the diagonal). +*> +*> On exit: +*> A is overwritten by rectangular K-by-N product H*A. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array A. LDA >= max(1,K). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,N) +*> +*> On entry: +*> a) In the M-by-(N-K) right block: input matrix B. +*> b) In the M-by-N left block: columns of V2. +*> +*> On exit: +*> B is overwritten by rectangular M-by-N product H*B. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,M). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, +*> dimension (LDWORK,max(K,N-K)) +*> \endverbatim +*> +*> \param[in] LDWORK +*> \verbatim +*> LDWORK is INTEGER +*> The leading dimension of the array WORK. LDWORK>=max(1,K). +*> +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexOTHERauxiliary +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> (1) Description of the Algebraic Operation. +*> +*> The matrix A is a K-by-N matrix composed of two column block +*> matrices, A1, which is K-by-K, and A2, which is K-by-(N-K): +*> A = ( A1, A2 ). +*> The matrix B is an M-by-N matrix composed of two column block +*> matrices, B1, which is M-by-K, and B2, which is M-by-(N-K): +*> B = ( B1, B2 ). +*> +*> Perform the operation: +*> +*> ( A_out ) := H * ( A_in ) = ( I - V * T * V**H ) * ( A_in ) = +*> ( B_out ) ( B_in ) ( B_in ) +*> = ( I - ( V1 ) * T * ( V1**H, V2**H ) ) * ( A_in ) +*> ( V2 ) ( B_in ) +*> On input: +*> +*> a) ( A_in ) consists of two block columns: +*> ( B_in ) +*> +*> ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in )) +*> ( B_in ) (( B1_in ) ( B2_in )) (( 0 ) ( B2_in )), +*> +*> where the column blocks are: +*> +*> ( A1_in ) is a K-by-K upper-triangular matrix stored in the +*> upper triangular part of the array A(1:K,1:K). +*> ( B1_in ) is an M-by-K rectangular ZERO matrix and not stored. +*> +*> ( A2_in ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_in ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). 
+*> +*> b) V = ( V1 ) +*> ( V2 ) +*> +*> where: +*> 1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored; +*> 2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix, +*> stored in the lower-triangular part of the array +*> A(1:K,1:K) (ones are not stored), +*> and V2 is an M-by-K rectangular stored the array B(1:M,1:K), +*> (because on input B1_in is a rectangular zero +*> matrix that is not stored and the space is +*> used to store V2). +*> +*> c) T is a K-by-K upper-triangular matrix stored +*> in the array T(1:K,1:K). +*> +*> On output: +*> +*> a) ( A_out ) consists of two block columns: +*> ( B_out ) +*> +*> ( A_out ) = (( A1_out ) ( A2_out )) +*> ( B_out ) (( B1_out ) ( B2_out )), +*> +*> where the column blocks are: +*> +*> ( A1_out ) is a K-by-K square matrix, or a K-by-K +*> upper-triangular matrix, if V1 is an +*> identity matrix. AiOut is stored in +*> the array A(1:K,1:K). +*> ( B1_out ) is an M-by-K rectangular matrix stored +*> in the array B(1:M,K:N). +*> +*> ( A2_out ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_out ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> +*> The operation above can be represented as the same operation +*> on each block column: +*> +*> ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**H ) * ( A1_in ) +*> ( B1_out ) ( 0 ) ( 0 ) +*> +*> ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**H ) * ( A2_in ) +*> ( B2_out ) ( B2_in ) ( B2_in ) +*> +*> If IDENT != 'I': +*> +*> The computation for column block 1: +*> +*> A1_out: = A1_in - V1*T*(V1**H)*A1_in +*> +*> B1_out: = - V2*T*(V1**H)*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - V1*T*( (V1**H)*A2_in + (V2**H)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( (V1**H)*A2_in + (V2**H)*B2_in ) +*> +*> If IDENT == 'I': +*> +*> The operation for column block 1: +*> +*> A1_out: = A1_in - V1*T*A1_in +*> +*> B1_out: = - V2*T*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - T*( A2_in + (V2**H)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( A2_in + (V2**H)*B2_in ) +*> +*> (2) Description of the Algorithmic Computation. +*> +*> In the first step, we compute column block 2, i.e. A2 and B2. +*> Here, we need to use the K-by-(N-K) rectangular workspace +*> matrix W2 that is of the same size as the matrix A2. +*> W2 is stored in the array WORK(1:K,1:(N-K)). +*> +*> In the second step, we compute column block 1, i.e. A1 and B1. +*> Here, we need to use the K-by-K square workspace matrix W1 +*> that is of the same size as the as the matrix A1. +*> W1 is stored in the array WORK(1:K,1:K). +*> +*> NOTE: Hence, in this routine, we need the workspace array WORK +*> only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from +*> the first step and W1 from the second step. +*> +*> Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I', +*> more computations than in the Case (B). 
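As a purely illustrative aside (not part of this patch): the interface described above can be exercised with a thin free-form wrapper such as the sketch below. The wrapper name apply_gett_sketch and the allocatable workspace are assumptions made only for illustration; the arrays A, B and T are assumed to already hold the trapezoid, V1/V2 and the triangular factor exactly as laid out in this section, and WORK is sized K-by-max(K,N-K) with LDWORK = max(1,K) as the argument descriptions above require.

    ! Illustrative wrapper only: applies the block reflector H to the
    ! stacked pair ( A ) over ( B ) in place, using the layout above.
    subroutine apply_gett_sketch( ident, m, n, k, t, ldt, a, lda, b, ldb )
       implicit none
       character, intent(in)    :: ident
       integer,   intent(in)    :: m, n, k, ldt, lda, ldb
       complex,   intent(in)    :: t( ldt, * )
       complex,   intent(inout) :: a( lda, * ), b( ldb, * )
       complex,   allocatable   :: work( :, : )
       external :: clarfb_gett

       ! WORK must be at least K-by-max(K,N-K); LDWORK >= max(1,K).
       allocate( work( max( 1, k ), max( 1, k, n-k ) ) )
       call clarfb_gett( ident, m, n, k, t, ldt, a, lda, b, ldb, &
                         work, max( 1, k ) )
       deallocate( work )
    end subroutine apply_gett_sketch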
+*> +*> if( IDENT != 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(2) W2: = (V1**H) * W2 = (unit_lower_tr_of_(A1)**H) * W2 +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(2) W1: = (V1**H) * W1 = (unit_lower_tr_of_(A1)**H) * W1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6) square A1: = A1 - W1 +*> end if +*> end if +*> +*> Case (B), when V1 is an identity matrix, i.e. IDENT == 'I', +*> less computations than in the Case (A) +*> +*> if( IDENT == 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(6) upper-triangular_of_(A1): = A1 - W1 +*> end if +*> end if +*> +*> Combine these cases (A) and (B) together, this is the resulting +*> algorithm: +*> +*> if ( N > K ) then +*> +*> (First Step - column block 2) +*> +*> col2_(1) W2: = A2 +*> if( IDENT != 'I' ) then +*> col2_(2) W2: = (V1**H) * W2 +*> = (unit_lower_tr_of_(A1)**H) * W2 +*> end if +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2] +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> if( IDENT != 'I' ) then +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> end if +*> col2_(7) A2: = A2 - W2 +*> +*> else +*> +*> (Second Step - column block 1) +*> +*> col1_(1) W1: = A1 +*> if( IDENT != 'I' ) then +*> col1_(2) W1: = (V1**H) * W1 +*> = (unit_lower_tr_of_(A1)**H) * W1 +*> end if +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> if( IDENT != 'I' ) then +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1) +*> end if +*> col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1) +*> +*> end if +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, + $ WORK, LDWORK ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER IDENT + INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), T( LDT, * ), + $ WORK( LDWORK, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX CONE, CZERO + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), + $ CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LNOTIDENT + INTEGER I, J +* .. +* .. EXTERNAL FUNCTIONS .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL CCOPY, CGEMM, CTRMM +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. 
K.GT.N ) + $ RETURN +* + LNOTIDENT = .NOT.LSAME( IDENT, 'I' ) +* +* ------------------------------------------------------------------ +* +* First Step. Computation of the Column Block 2: +* +* ( A2 ) := H * ( A2 ) +* ( B2 ) ( B2 ) +* +* ------------------------------------------------------------------ +* + IF( N.GT.K ) THEN +* +* col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N) +* into W2=WORK(1:K, 1:N-K) column-by-column. +* + DO J = 1, N-K + CALL CCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 ) + END DO + + IF( LNOTIDENT ) THEN +* +* col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2, +* V1 is not an identy matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored). +* +* + CALL CTRMM( 'L', 'L', 'C', 'U', K, N-K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(3) Compute W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL CGEMM( 'C', 'N', K, N-K, M, CONE, B, LDB, + $ B( 1, K+1 ), LDB, CONE, WORK, LDWORK ) + END IF +* +* col2_(4) Compute W2: = T * W2, +* T is upper-triangular. +* + CALL CTRMM( 'L', 'U', 'N', 'N', K, N-K, CONE, T, LDT, + $ WORK, LDWORK ) +* +* col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2, +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL CGEMM( 'N', 'N', M, N-K, K, -CONE, B, LDB, + $ WORK, LDWORK, CONE, B( 1, K+1 ), LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col2_(6) Compute W2: = V1 * W2 = A1 * W2, +* V1 is not an identity matrix, but unit lower-triangular, +* V1 stored in A1 (diagonal ones are not stored). +* + CALL CTRMM( 'L', 'L', 'N', 'U', K, N-K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(7) Compute A2: = A2 - W2 = +* = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K), +* column-by-column. +* + DO J = 1, N-K + DO I = 1, K + A( I, K+J ) = A( I, K+J ) - WORK( I, J ) + END DO + END DO +* + END IF +* +* ------------------------------------------------------------------ +* +* Second Step. Computation of the Column Block 1: +* +* ( A1 ) := H * ( A1 ) +* ( B1 ) ( 0 ) +* +* ------------------------------------------------------------------ +* +* col1_(1) Compute W1: = A1. Copy the upper-triangular +* A1 = A(1:K, 1:K) into the upper-triangular +* W1 = WORK(1:K, 1:K) column-by-column. +* + DO J = 1, K + CALL CCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 ) + END DO +* +* Set the subdiagonal elements of W1 to zero column-by-column. +* + DO J = 1, K - 1 + DO I = J + 1, K + WORK( I, J ) = CZERO + END DO + END DO +* + IF( LNOTIDENT ) THEN +* +* col1_(2) Compute W1: = (V1**H) * W1 = (A1**H) * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL CTRMM( 'L', 'L', 'C', 'U', K, K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col1_(3) Compute W1: = T * W1, +* T is upper-triangular, +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL CTRMM( 'L', 'U', 'N', 'N', K, K, CONE, T, LDT, + $ WORK, LDWORK ) +* +* col1_(4) Compute B1: = - V2 * W1 = - B1 * W1, +* V2 = B1, W1 is upper-triangular with zeroes below the diagonal. +* + IF( M.GT.0 ) THEN + CALL CTRMM( 'R', 'U', 'N', 'N', M, K, -CONE, WORK, LDWORK, + $ B, LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col1_(5) Compute W1: = V1 * W1 = A1 * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular on input with zeroes below the diagonal, +* and square on output. 
+* + CALL CTRMM( 'L', 'L', 'N', 'U', K, K, CONE, A, LDA, + $ WORK, LDWORK ) +* +* col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K) +* column-by-column. A1 is upper-triangular on input. +* If IDENT, A1 is square on output, and W1 is square, +* if NOT IDENT, A1 is upper-triangular on output, +* W1 is upper-triangular. +* +* col1_(6)_a Compute elements of A1 below the diagonal. +* + DO J = 1, K - 1 + DO I = J + 1, K + A( I, J ) = - WORK( I, J ) + END DO + END DO +* + END IF +* +* col1_(6)_b Compute elements of A1 on and above the diagonal. +* + DO J = 1, K + DO I = 1, J + A( I, J ) = A( I, J ) - WORK( I, J ) + END DO + END DO +* + RETURN +* +* End of CLARFB_GETT +* + END diff --git a/lapack-netlib/SRC/clarrv.f b/lapack-netlib/SRC/clarrv.f index a45f55ac3..26a9febc8 100644 --- a/lapack-netlib/SRC/clarrv.f +++ b/lapack-netlib/SRC/clarrv.f @@ -351,7 +351,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0) .OR. (M.LE.0) ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/ctgsja.f b/lapack-netlib/SRC/ctgsja.f index 38a61068e..c96cbe022 100644 --- a/lapack-netlib/SRC/ctgsja.f +++ b/lapack-netlib/SRC/ctgsja.f @@ -401,7 +401,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - REAL ZERO, ONE + REAL ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) COMPLEX CZERO, CONE PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), @@ -424,7 +424,8 @@ $ SLARTG, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC ABS, CONJG, MAX, MIN, REAL + INTRINSIC ABS, CONJG, MAX, MIN, REAL, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -610,9 +611,9 @@ * A1 = REAL( A( K+I, N-L+I ) ) B1 = REAL( B( I, N-L+I ) ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * IF( GAMMA.LT.ZERO ) THEN CALL CSSCAL( L-I+1, -ONE, B( I, N-L+I ), LDB ) diff --git a/lapack-netlib/SRC/cungbr.f b/lapack-netlib/SRC/cungbr.f index df25799ca..0dddd42a6 100644 --- a/lapack-netlib/SRC/cungbr.f +++ b/lapack-netlib/SRC/cungbr.f @@ -222,8 +222,8 @@ CALL CUNGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( M.GT.1 ) THEN - CALL CUNGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL CUNGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF ELSE @@ -231,8 +231,8 @@ CALL CUNGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( N.GT.1 ) THEN - CALL CUNGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL CUNGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF END IF diff --git a/lapack-netlib/SRC/cungtsqr_row.f b/lapack-netlib/SRC/cungtsqr_row.f new file mode 100644 index 000000000..e1597c58b --- /dev/null +++ b/lapack-netlib/SRC/cungtsqr_row.f @@ -0,0 +1,380 @@ +*> \brief \b CUNGTSQR_ROW +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CUNGTSQR_ROW + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +*> +* Definition: +* =========== +* +* SUBROUTINE CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CUNGTSQR_ROW generates an M-by-N complex matrix Q_out with +*> orthonormal columns from the output of CLATSQR. 
These N orthonormal +*> columns are the first N columns of a product of complex unitary +*> matrices Q(k)_in of order M, which are returned by CLATSQR in +*> a special format. +*> +*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ). +*> +*> The input matrices Q(k)_in are stored in row and column blocks in A. +*> See the documentation of CLATSQR for more details on the format of +*> Q(k)_in, where each Q(k)_in is represented by block Householder +*> transformations. This routine calls an auxiliary routine CLARFB_GETT, +*> where the computation is performed on each individual block. The +*> algorithm first sweeps NB-sized column blocks from the right to left +*> starting in the bottom row block and continues to the top row block +*> (hence _ROW in the routine name). This sweep is in reverse order of +*> the order in which CLATSQR generates the output blocks. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB +*> \verbatim +*> MB is INTEGER +*> The row block size used by CLATSQR to return +*> arrays A and T. MB > N. +*> (Note that if MB > M, then M is used instead of MB +*> as the row block size). +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by CLATSQR to return +*> arrays A and T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by CLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See CLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See CLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. 
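A rough usage sketch, not taken from this patch, may help here: CLATSQR produces the blocked V and T that CUNGTSQR_ROW consumes, and both accept the LWORK = -1 workspace query described in this section. The program name, the dimensions M = 1000, N = 32 and the block sizes MB = 128, NB = 16 are arbitrary illustrative choices (picked so that MB > N and 1 <= NB <= N); the NIRB formula simply restates MAX(1, CEIL((M-N)/(MB-N))) from the description of T above.

    program tsqr_explicit_q_sketch
       implicit none
       integer, parameter   :: m = 1000, n = 32, mb = 128, nb = 16
       integer              :: nirb, info, lwork
       real                 :: re( m, n ), im( m, n )
       complex              :: a( m, n ), wquery( 1 )
       complex, allocatable :: t( :, : ), work( : )
       external :: clatsqr, cungtsqr_row

       ! Random test matrix.
       call random_number( re )
       call random_number( im )
       a = cmplx( re, im )

       ! NIRB = MAX( 1, CEIL((M-N)/(MB-N)) ) row blocks; T is NB-by-(N*NIRB).
       nirb = max( 1, ( m - n + mb - n - 1 ) / ( mb - n ) )
       allocate( t( nb, n*nirb ) )

       ! TSQR factorization: workspace query, then the factorization itself.
       call clatsqr( m, n, mb, nb, a, m, t, nb, wquery, -1, info )
       lwork = int( wquery( 1 ) )
       allocate( work( lwork ) )
       call clatsqr( m, n, mb, nb, a, m, t, nb, work, lwork, info )
       if( info /= 0 ) stop 'clatsqr failed'
       deallocate( work )

       ! Form the explicit Q (M-by-N, orthonormal columns) in place in A.
       ! (If R is needed, copy the upper triangle of A before this step.)
       call cungtsqr_row( m, n, mb, nb, a, m, t, nb, wquery, -1, info )
       lwork = int( wquery( 1 ) )
       allocate( work( lwork ) )
       call cungtsqr_row( m, n, mb, nb, a, m, t, nb, work, lwork, info )
       if( info /= 0 ) stop 'cungtsqr_row failed'
    end program tsqr_explicit_q_sketch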
+*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX CONE, CZERO + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), + $ CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + COMPLEX DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL CLARFB_GETT, CLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CUNGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL CLASET('U', M, N, CZERO, CONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. 
JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. +* + MB2 = MB - N + M_PLUS_ONE = M + 1 + ITMP = ( M - MB - 1 ) / MB2 + IB_BOTTOM = ITMP * MB2 + MB + 1 + NUM_ALL_ROW_BLOCKS = ITMP + 2 + JB_T = NUM_ALL_ROW_BLOCKS * N + 1 +* + DO IB = IB_BOTTOM, MB+1, -MB2 +* +* Determine the block size IMB for the current row block +* in the matrix A. +* + IMB = MIN( M_PLUS_ONE - IB, MB2 ) +* +* Determine the column index JB_T for the current column block +* in the matrix T. +* + JB_T = JB_T - N +* +* Apply column blocks of H in the row block from right to left. +* +* KB is the column index of the current column block reflector +* in the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + CALL CLARFB_GETT( 'I', IMB, N-KB+1, KNB, + $ T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA, + $ A( IB, KB ), LDA, WORK, KNB ) +* + END DO +* + END DO +* + END IF +* +* (2) Top row block of A. +* NOTE: If MB>=M, then we have only one row block of A of size M +* and we work on the entire matrix A. +* + MB1 = MIN( MB, M ) +* +* Apply column blocks of H in the top row block from right to left. +* +* KB is the column index of the current block reflector in +* the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + IF( MB1-KB-KNB+1.EQ.0 ) THEN +* +* In SLARFB_GETT parameters, when M=0, then the matrix B +* does not exist, hence we need to pass a dummy array +* reference DUMMY(1,1) to B with LDDUMMY=1. +* + CALL CLARFB_GETT( 'N', 0, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ DUMMY( 1, 1 ), 1, WORK, KNB ) + ELSE + CALL CLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ A( KB+KNB, KB), LDA, WORK, KNB ) + + END IF +* + END DO +* + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN +* +* End of CUNGTSQR_ROW +* + END diff --git a/lapack-netlib/SRC/dgeqrt2.f b/lapack-netlib/SRC/dgeqrt2.f index 138dd4d9c..00f800d43 100644 --- a/lapack-netlib/SRC/dgeqrt2.f +++ b/lapack-netlib/SRC/dgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup doubleGEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE DGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN diff --git a/lapack-netlib/SRC/dgesdd.f b/lapack-netlib/SRC/dgesdd.f index 0218900d2..80d18041c 100644 --- a/lapack-netlib/SRC/dgesdd.f +++ b/lapack-netlib/SRC/dgesdd.f @@ -267,9 +267,9 @@ $ XERBLA * .. * .. External Functions .. 
- LOGICAL LSAME + LOGICAL LSAME, DISNAN DOUBLE PRECISION DLAMCH, DLANGE - EXTERNAL DLAMCH, DLANGE, LSAME + EXTERNAL DLAMCH, DLANGE, LSAME, DISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -599,6 +599,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = DLANGE( 'M', M, N, A, LDA, DUM ) + IF( DISNAN( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN ISCL = 1 diff --git a/lapack-netlib/SRC/dgetsqrhrt.f b/lapack-netlib/SRC/dgetsqrhrt.f new file mode 100644 index 000000000..668deeba8 --- /dev/null +++ b/lapack-netlib/SRC/dgetsqrhrt.f @@ -0,0 +1,349 @@ +*> \brief \b DGETSQRHRT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGETSQRHRT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGETSQRHRT computes a NB2-sized column blocked QR-factorization +*> of a real M-by-N matrix A with M >= N, +*> +*> A = Q * R. +*> +*> The routine uses internally a NB1-sized column blocked and MB1-sized +*> row blocked TSQR-factorization and perfors the reconstruction +*> of the Householder vectors from the TSQR output. The routine also +*> converts the R_tsqr factor from the TSQR-factorization output into +*> the R factor that corresponds to the Householder QR-factorization, +*> +*> A = Q_tsqr * R_tsqr = Q * R. +*> +*> The output Q and R factors are stored in the same format as in DGEQRT +*> (Q is in blocked compact WY-representation). See the documentation +*> of DGEQRT for more details on the format. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> The row block size to be used in the blocked TSQR. +*> MB1 > N. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> The column block size to be used in the blocked TSQR. +*> N >= NB1 >= 1. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> The block size to be used in the blocked QR that is +*> output. NB2 >= 1. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> +*> On entry: an M-by-N matrix A. +*> +*> On exit: +*> a) the elements on and above the diagonal +*> of the array contain the N-by-N upper-triangular +*> matrix R corresponding to the Householder QR; +*> b) the elements below the diagonal represent Q by +*> the columns of blocked V (compact WY-representation). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is DOUBLE PRECISION array, dimension (LDT,N)) +*> The upper triangular block reflectors stored in compact form +*> as a sequence of upper triangular blocks. 
+*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= NB2. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> where +*> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), +*> NB1LOCAL = MIN(NB1,N). +*> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, +*> LW1 = NB1LOCAL * N, +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE DGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, + $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DLATSQR, DORGTSQR_ROW, DORHR_COL, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB1.LE.N ) THEN + INFO = -3 + ELSE IF( NB1.LT.1 ) THEN + INFO = -4 + ELSE IF( NB2.LT.1 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + INFO = -9 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array: +* a) Matrix T and WORK for DLATSQR; +* b) N-by-N upper-triangular factor R_tsqr; +* c) Matrix T and array WORK for DORGTSQR_ROW; +* d) Diagonal D for DORHR_COL. +* + IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. 
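For orientation, a hedged double-precision usage sketch (not part of the patch): it runs the LWORK = -1 query described above, performs the factorization, and then, relying on the statement that the Q and R factors come out in DGEQRT format, applies Q**T to a block of right-hand sides with the existing DGEMQRT routine. The program name, the dimensions M = 1000, N = 40, NRHS = 5 and the block sizes MB1 = 120, NB1 = 20, NB2 = 16 are arbitrary choices satisfying MB1 > N, 1 <= NB1 <= N and NB2 >= 1; the DGEMQRT step is an assumption based on the format note above, not something this patch itself exercises.

    program getsqrhrt_sketch
       implicit none
       integer, parameter :: m = 1000, n = 40, nrhs = 5
       integer, parameter :: mb1 = 120, nb1 = 20, nb2 = 16
       double precision   :: a( m, n ), c( m, nrhs ), t( nb2, n )
       double precision   :: wquery( 1 )
       double precision, allocatable :: work( : )
       integer            :: lwork, info
       external :: dgetsqrhrt, dgemqrt

       call random_number( a )
       call random_number( c )

       ! Workspace query (LWORK = -1), then the blocked TSQR-based QR itself.
       call dgetsqrhrt( m, n, mb1, nb1, nb2, a, m, t, nb2, wquery, -1, info )
       lwork = int( wquery( 1 ) )
       allocate( work( lwork ) )
       call dgetsqrhrt( m, n, mb1, nb1, nb2, a, m, t, nb2, work, lwork, info )
       if( info /= 0 ) stop 'dgetsqrhrt failed'
       deallocate( work )

       ! R now sits in the upper triangle of A; Q is held as blocked
       ! reflectors below the diagonal of A together with T.  Since the
       ! format matches DGEQRT output (per the note above), Q**T can
       ! presumably be applied with DGEMQRT; WORK needs NRHS*NB2 entries
       ! for SIDE = 'L'.
       allocate( work( nrhs*nb2 ) )
       call dgemqrt( 'L', 'T', m, nrhs, n, nb2, a, m, t, nb2, c, m, &
                     work, info )
       if( info /= 0 ) stop 'dgemqrt failed'
       ! C now holds Q**T * C, e.g. the first step of a least-squares solve.
    end program getsqrhrt_sketch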
+* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of DORGTSQR_ROW work array. +* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. +* + CALL DLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL DCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL DORGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL DORHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. +* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the DORHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. +* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-ONE ) THEN + DO J = I, N + A( I, J ) = -ONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL DCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN +* +* End of DGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/dggglm.f b/lapack-netlib/SRC/dggglm.f index 2e92912e0..1fbdc8add 100644 --- a/lapack-netlib/SRC/dggglm.f +++ b/lapack-netlib/SRC/dggglm.f @@ -270,8 +270,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = ZERO + END DO + DO I = 1, P + Y(I) = ZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/dhseqr.f b/lapack-netlib/SRC/dhseqr.f index b4fc3af90..6b7fb308f 100644 --- a/lapack-netlib/SRC/dhseqr.f +++ b/lapack-netlib/SRC/dhseqr.f @@ -338,10 +338,10 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare DLAHQR failure. NL > NTINY = 11 is +* . through a rare DLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. 
(The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/dlanv2.f b/lapack-netlib/SRC/dlanv2.f index 61b016f16..1c277c6bb 100644 --- a/lapack-netlib/SRC/dlanv2.f +++ b/lapack-netlib/SRC/dlanv2.f @@ -139,7 +139,7 @@ * ===================================================================== * * .. Parameters .. - DOUBLE PRECISION ZERO, HALF, ONE + DOUBLE PRECISION ZERO, HALF, ONE, TWO PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0, $ TWO = 2.0D0 ) DOUBLE PRECISION MULTPL diff --git a/lapack-netlib/SRC/dlaqr0.f b/lapack-netlib/SRC/dlaqr0.f index f362c096c..8334d8d2b 100644 --- a/lapack-netlib/SRC/dlaqr0.f +++ b/lapack-netlib/SRC/dlaqr0.f @@ -278,7 +278,7 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -362,22 +362,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'DLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'DLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -425,7 +425,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -576,7 +576,7 @@ * * ==== Got NS/2 or fewer shifts? Use DLAQR4 or * . DLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -698,7 +698,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/dlaqr4.f b/lapack-netlib/SRC/dlaqr4.f index 454bf9608..163e55deb 100644 --- a/lapack-netlib/SRC/dlaqr4.f +++ b/lapack-netlib/SRC/dlaqr4.f @@ -284,7 +284,7 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -368,22 +368,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) 
==== * NWR = ILAENV( 13, 'DLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'DLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -431,7 +431,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -582,7 +582,7 @@ * * ==== Got NS/2 or fewer shifts? Use DLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -697,7 +697,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/dlaqr5.f b/lapack-netlib/SRC/dlaqr5.f index f58db9c89..12e7db637 100644 --- a/lapack-netlib/SRC/dlaqr5.f +++ b/lapack-netlib/SRC/dlaqr5.f @@ -70,10 +70,9 @@ *> matrix entries. *> = 1: DLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: DLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. *> \endverbatim *> *> \param[in] N @@ -178,14 +177,14 @@ *> *> \param[out] U *> \verbatim -*> U is DOUBLE PRECISION array, dimension (LDU,3*NSHFTS-3) +*> U is DOUBLE PRECISION array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -197,7 +196,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is DOUBLE PRECISION array, dimension (LDWV,3*NSHFTS-3) +*> WV is DOUBLE PRECISION array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -223,7 +222,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -234,7 +233,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup doubleOTHERauxiliary * @@ -243,6 +242,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -252,10 +256,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. 
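As a worked check on the new workspace bounds (the numbers here are chosen only for illustration): with NSHFTS = 24 shifts, i.e. NBMPS = 12 bulge pairs, the old slab width was KDU = 6*NBMPS-3 = 69 columns and the U, WV and WH arrays had to provide 3*NSHFTS-3 = 69 columns, whereas with the packed bulge chains the slab is KDU = 4*NBMPS = 48 columns and 2*NSHFTS = 48 columns suffice, with LDU and LDWH at least 2*NSHFTS, matching the Karlsson, Kressner and Lang reference cited next.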
*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, $ SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, $ LDU, NV, WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -282,11 +291,11 @@ DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM, $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, $ ULP - INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -356,10 +365,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -371,28 +376,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 220 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL DLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 150 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -401,17 +417,134 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) 
==== * - DO 20 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL DLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), + $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), + $ V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. ==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) + IF( TST1.EQ.ZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + ABS( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + ABS( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + ABS( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + ABS( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + ABS( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + ABS( H( K+4, K+1 ) ) + END IF + IF( ABS( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H21 = MIN( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H11 = MAX( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. 
==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL DLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), @@ -419,7 +552,20 @@ ALPHA = V( 1, M ) CALL DLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*V( 2, M ) + H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*V( 3, M ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL DLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -467,7 +613,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -481,154 +627,29 @@ END IF END IF END IF - 20 CONTINUE -* -* ==== Generate a 2-by-2 reflection, if needed. ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL DLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), - $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), - $ V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF -* -* ==== Multiply H by reflections from the left ==== -* - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 40 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 30 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 30 CONTINUE - 40 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 50 J = MAX( K+1, KTOP ), JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 50 CONTINUE - END IF * -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 90 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 60 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) - 60 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. 
(If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 70 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) - 70 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. ==== -* - DO 80 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) - 80 CONTINUE - END IF - END IF - 90 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 100 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) - 100 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 110 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*V( 2, M22 ) - 110 CONTINUE - ELSE IF( WANTZ ) THEN - DO 120 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) - 120 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 130 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. ==== +* + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) + H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + 70 CONTINUE +* +* ==== Perform update from left for subsequent +* . column. ==== +* + REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* + $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -639,6 +660,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) IF( TST1.EQ.ZERO ) THEN @@ -667,25 +690,77 @@ TST2 = H22*( H11 / SCL ) * IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. - $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF END IF END IF - 130 CONTINUE + 80 CONTINUE * -* ==== Fill in the last row of each bulge. 
==== +* ==== Multiply H by reflections from the left ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 140 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*V( 2, M ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M ) - 140 CONTINUE + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* + $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) ==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 150 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -699,220 +774,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN -* -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. 
==== -* - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 -* -* ==== Horizontal Multiply ==== -* - DO 160 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL DGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 +* +* ==== Horizontal Multiply ==== +* + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL DGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, $ LDWH ) - CALL DLACPY( 'ALL', NU, JLEN, WH, LDWH, + CALL DLACPY( 'ALL', NU, JLEN, WH, LDWH, $ H( INCOL+K1, JCOL ), LDH ) - 160 CONTINUE + 150 CONTINUE +* +* ==== Vertical multiply ==== +* + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE * -* ==== Vertical multiply ==== +* ==== Z multiply (also vertical) ==== * - DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) + $ Z( JROW, INCOL+K1 ), LDZ ) 170 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 180 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 180 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 190 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) 
==== -* - CALL DLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL DLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL DTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**T ==== -* - CALL DGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL DLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL DTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL DGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL DLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 190 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL DLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL DLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 200 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 210 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL DLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL DLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 210 CONTINUE - END IF END IF END IF - 220 CONTINUE + 180 CONTINUE * * ==== End of DLAQR5 ==== * diff --git a/lapack-netlib/SRC/dlarfb_gett.f b/lapack-netlib/SRC/dlarfb_gett.f new file mode 100644 index 000000000..10ab6461e --- /dev/null +++ b/lapack-netlib/SRC/dlarfb_gett.f @@ -0,0 +1,596 @@ +*> \brief \b DLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), B( LDB, * ), T( LDT, * ), +* $ WORK( LDWORK, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLARFB_GETT applies a real Householder block reflector H from the +*> left to a real (K+M)-by-N "triangular-pentagonal" matrix +*> composed of two block matrices: an upper trapezoidal K-by-N matrix A +*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored +*> in the array B. The block reflector H is stored in a compact +*> WY-representation, where the elementary reflectors are in the +*> arrays A, B and T. See Further Details section. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] IDENT +*> \verbatim +*> IDENT is CHARACTER*1 +*> If IDENT = not 'I', or not 'i', then V1 is unit +*> lower-triangular and stored in the left K-by-K block of +*> the input matrix A, +*> If IDENT = 'I' or 'i', then V1 is an identity matrix and +*> not stored. +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix B. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrices A and B. +*> N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The number or rows of the matrix A. +*> K is also order of the matrix T, i.e. the number of +*> elementary reflectors whose product defines the block +*> reflector. 0 <= K <= N. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is DOUBLE PRECISION array, dimension (LDT,K) +*> The upper-triangular K-by-K matrix T in the representation +*> of the block reflector. 
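+*>          See Further Details section.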
+*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> +*> On entry: +*> a) In the K-by-N upper-trapezoidal part A: input matrix A. +*> b) In the columns below the diagonal: columns of V1 +*> (ones are not stored on the diagonal). +*> +*> On exit: +*> A is overwritten by rectangular K-by-N product H*A. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array A. LDA >= max(1,K). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,N) +*> +*> On entry: +*> a) In the M-by-(N-K) right block: input matrix B. +*> b) In the M-by-N left block: columns of V2. +*> +*> On exit: +*> B is overwritten by rectangular M-by-N product H*B. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,M). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, +*> dimension (LDWORK,max(K,N-K)) +*> \endverbatim +*> +*> \param[in] LDWORK +*> \verbatim +*> LDWORK is INTEGER +*> The leading dimension of the array WORK. LDWORK>=max(1,K). +*> +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> (1) Description of the Algebraic Operation. +*> +*> The matrix A is a K-by-N matrix composed of two column block +*> matrices, A1, which is K-by-K, and A2, which is K-by-(N-K): +*> A = ( A1, A2 ). +*> The matrix B is an M-by-N matrix composed of two column block +*> matrices, B1, which is M-by-K, and B2, which is M-by-(N-K): +*> B = ( B1, B2 ). +*> +*> Perform the operation: +*> +*> ( A_out ) := H * ( A_in ) = ( I - V * T * V**T ) * ( A_in ) = +*> ( B_out ) ( B_in ) ( B_in ) +*> = ( I - ( V1 ) * T * ( V1**T, V2**T ) ) * ( A_in ) +*> ( V2 ) ( B_in ) +*> On input: +*> +*> a) ( A_in ) consists of two block columns: +*> ( B_in ) +*> +*> ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in )) +*> ( B_in ) (( B1_in ) ( B2_in )) (( 0 ) ( B2_in )), +*> +*> where the column blocks are: +*> +*> ( A1_in ) is a K-by-K upper-triangular matrix stored in the +*> upper triangular part of the array A(1:K,1:K). +*> ( B1_in ) is an M-by-K rectangular ZERO matrix and not stored. +*> +*> ( A2_in ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_in ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> b) V = ( V1 ) +*> ( V2 ) +*> +*> where: +*> 1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored; +*> 2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix, +*> stored in the lower-triangular part of the array +*> A(1:K,1:K) (ones are not stored), +*> and V2 is an M-by-K rectangular stored the array B(1:M,1:K), +*> (because on input B1_in is a rectangular zero +*> matrix that is not stored and the space is +*> used to store V2). 
+*> +*> c) T is a K-by-K upper-triangular matrix stored +*> in the array T(1:K,1:K). +*> +*> On output: +*> +*> a) ( A_out ) consists of two block columns: +*> ( B_out ) +*> +*> ( A_out ) = (( A1_out ) ( A2_out )) +*> ( B_out ) (( B1_out ) ( B2_out )), +*> +*> where the column blocks are: +*> +*> ( A1_out ) is a K-by-K square matrix, or a K-by-K +*> upper-triangular matrix, if V1 is an +*> identity matrix. AiOut is stored in +*> the array A(1:K,1:K). +*> ( B1_out ) is an M-by-K rectangular matrix stored +*> in the array B(1:M,K:N). +*> +*> ( A2_out ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_out ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> +*> The operation above can be represented as the same operation +*> on each block column: +*> +*> ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**T ) * ( A1_in ) +*> ( B1_out ) ( 0 ) ( 0 ) +*> +*> ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**T ) * ( A2_in ) +*> ( B2_out ) ( B2_in ) ( B2_in ) +*> +*> If IDENT != 'I': +*> +*> The computation for column block 1: +*> +*> A1_out: = A1_in - V1*T*(V1**T)*A1_in +*> +*> B1_out: = - V2*T*(V1**T)*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - V1*T*( (V1**T)*A2_in + (V2**T)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( (V1**T)*A2_in + (V2**T)*B2_in ) +*> +*> If IDENT == 'I': +*> +*> The operation for column block 1: +*> +*> A1_out: = A1_in - V1*T**A1_in +*> +*> B1_out: = - V2*T**A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - T*( A2_in + (V2**T)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( A2_in + (V2**T)*B2_in ) +*> +*> (2) Description of the Algorithmic Computation. +*> +*> In the first step, we compute column block 2, i.e. A2 and B2. +*> Here, we need to use the K-by-(N-K) rectangular workspace +*> matrix W2 that is of the same size as the matrix A2. +*> W2 is stored in the array WORK(1:K,1:(N-K)). +*> +*> In the second step, we compute column block 1, i.e. A1 and B1. +*> Here, we need to use the K-by-K square workspace matrix W1 +*> that is of the same size as the as the matrix A1. +*> W1 is stored in the array WORK(1:K,1:K). +*> +*> NOTE: Hence, in this routine, we need the workspace array WORK +*> only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from +*> the first step and W1 from the second step. +*> +*> Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I', +*> more computations than in the Case (B). +*> +*> if( IDENT != 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(2) W2: = (V1**T) * W2 = (unit_lower_tr_of_(A1)**T) * W2 +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(2) W1: = (V1**T) * W1 = (unit_lower_tr_of_(A1)**T) * W1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6) square A1: = A1 - W1 +*> end if +*> end if +*> +*> Case (B), when V1 is an identity matrix, i.e. 
IDENT == 'I', +*> less computations than in the Case (A) +*> +*> if( IDENT == 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(6) upper-triangular_of_(A1): = A1 - W1 +*> end if +*> end if +*> +*> Combine these cases (A) and (B) together, this is the resulting +*> algorithm: +*> +*> if ( N > K ) then +*> +*> (First Step - column block 2) +*> +*> col2_(1) W2: = A2 +*> if( IDENT != 'I' ) then +*> col2_(2) W2: = (V1**T) * W2 +*> = (unit_lower_tr_of_(A1)**T) * W2 +*> end if +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2] +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> if( IDENT != 'I' ) then +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> end if +*> col2_(7) A2: = A2 - W2 +*> +*> else +*> +*> (Second Step - column block 1) +*> +*> col1_(1) W1: = A1 +*> if( IDENT != 'I' ) then +*> col1_(2) W1: = (V1**T) * W1 +*> = (unit_lower_tr_of_(A1)**T) * W1 +*> end if +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> if( IDENT != 'I' ) then +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1) +*> end if +*> col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1) +*> +*> end if +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE DLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, + $ WORK, LDWORK ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER IDENT + INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), T( LDT, * ), + $ WORK( LDWORK, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LNOTIDENT + INTEGER I, J +* .. +* .. EXTERNAL FUNCTIONS .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DGEMM, DTRMM +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N ) + $ RETURN +* + LNOTIDENT = .NOT.LSAME( IDENT, 'I' ) +* +* ------------------------------------------------------------------ +* +* First Step. Computation of the Column Block 2: +* +* ( A2 ) := H * ( A2 ) +* ( B2 ) ( B2 ) +* +* ------------------------------------------------------------------ +* + IF( N.GT.K ) THEN +* +* col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N) +* into W2=WORK(1:K, 1:N-K) column-by-column. +* + DO J = 1, N-K + CALL DCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 ) + END DO + + IF( LNOTIDENT ) THEN +* +* col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2, +* V1 is not an identy matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored). +* +* + CALL DTRMM( 'L', 'L', 'T', 'U', K, N-K, ONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(3) Compute W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +* V2 stored in B1. 
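+*             (The DGEMM call below forms the K-by-(N-K) product
+*             (B1**T)*B2 and adds it to W2; it is skipped when M = 0,
+*             because B1 and B2 then have no rows.)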
+* + IF( M.GT.0 ) THEN + CALL DGEMM( 'T', 'N', K, N-K, M, ONE, B, LDB, + $ B( 1, K+1 ), LDB, ONE, WORK, LDWORK ) + END IF +* +* col2_(4) Compute W2: = T * W2, +* T is upper-triangular. +* + CALL DTRMM( 'L', 'U', 'N', 'N', K, N-K, ONE, T, LDT, + $ WORK, LDWORK ) +* +* col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2, +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL DGEMM( 'N', 'N', M, N-K, K, -ONE, B, LDB, + $ WORK, LDWORK, ONE, B( 1, K+1 ), LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col2_(6) Compute W2: = V1 * W2 = A1 * W2, +* V1 is not an identity matrix, but unit lower-triangular, +* V1 stored in A1 (diagonal ones are not stored). +* + CALL DTRMM( 'L', 'L', 'N', 'U', K, N-K, ONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(7) Compute A2: = A2 - W2 = +* = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K), +* column-by-column. +* + DO J = 1, N-K + DO I = 1, K + A( I, K+J ) = A( I, K+J ) - WORK( I, J ) + END DO + END DO +* + END IF +* +* ------------------------------------------------------------------ +* +* Second Step. Computation of the Column Block 1: +* +* ( A1 ) := H * ( A1 ) +* ( B1 ) ( 0 ) +* +* ------------------------------------------------------------------ +* +* col1_(1) Compute W1: = A1. Copy the upper-triangular +* A1 = A(1:K, 1:K) into the upper-triangular +* W1 = WORK(1:K, 1:K) column-by-column. +* + DO J = 1, K + CALL DCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 ) + END DO +* +* Set the subdiagonal elements of W1 to zero column-by-column. +* + DO J = 1, K - 1 + DO I = J + 1, K + WORK( I, J ) = ZERO + END DO + END DO +* + IF( LNOTIDENT ) THEN +* +* col1_(2) Compute W1: = (V1**T) * W1 = (A1**T) * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL DTRMM( 'L', 'L', 'T', 'U', K, K, ONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col1_(3) Compute W1: = T * W1, +* T is upper-triangular, +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL DTRMM( 'L', 'U', 'N', 'N', K, K, ONE, T, LDT, + $ WORK, LDWORK ) +* +* col1_(4) Compute B1: = - V2 * W1 = - B1 * W1, +* V2 = B1, W1 is upper-triangular with zeroes below the diagonal. +* + IF( M.GT.0 ) THEN + CALL DTRMM( 'R', 'U', 'N', 'N', M, K, -ONE, WORK, LDWORK, + $ B, LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col1_(5) Compute W1: = V1 * W1 = A1 * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular on input with zeroes below the diagonal, +* and square on output. +* + CALL DTRMM( 'L', 'L', 'N', 'U', K, K, ONE, A, LDA, + $ WORK, LDWORK ) +* +* col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K) +* column-by-column. A1 is upper-triangular on input. +* If IDENT, A1 is square on output, and W1 is square, +* if NOT IDENT, A1 is upper-triangular on output, +* W1 is upper-triangular. +* +* col1_(6)_a Compute elements of A1 below the diagonal. +* + DO J = 1, K - 1 + DO I = J + 1, K + A( I, J ) = - WORK( I, J ) + END DO + END DO +* + END IF +* +* col1_(6)_b Compute elements of A1 on and above the diagonal. 
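+*             (Only the upper triangle of A1 changes here: A( I, J ) is
+*             updated for I <= J, matching the loop bounds below.)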
+* + DO J = 1, K + DO I = 1, J + A( I, J ) = A( I, J ) - WORK( I, J ) + END DO + END DO +* + RETURN +* +* End of DLARFB_GETT +* + END diff --git a/lapack-netlib/SRC/dlarrv.f b/lapack-netlib/SRC/dlarrv.f index 4a59a2bbf..a1c6e9c9d 100644 --- a/lapack-netlib/SRC/dlarrv.f +++ b/lapack-netlib/SRC/dlarrv.f @@ -353,7 +353,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/dlasq2.f b/lapack-netlib/SRC/dlasq2.f index 68d922870..27eb1f79a 100644 --- a/lapack-netlib/SRC/dlasq2.f +++ b/lapack-netlib/SRC/dlasq2.f @@ -184,10 +184,18 @@ * * 2-by-2 case. * - IF( Z( 2 ).LT.ZERO .OR. Z( 3 ).LT.ZERO ) THEN - INFO = -2 + IF( Z( 1 ).LT.ZERO ) THEN + INFO = -201 + CALL XERBLA( 'DLASQ2', 2 ) + RETURN + ELSE IF( Z( 2 ).LT.ZERO ) THEN + INFO = -202 CALL XERBLA( 'DLASQ2', 2 ) RETURN + ELSE IF( Z( 3 ).LT.ZERO ) THEN + INFO = -203 + CALL XERBLA( 'DLASQ2', 2 ) + RETURN ELSE IF( Z( 3 ).GT.Z( 1 ) ) THEN D = Z( 3 ) Z( 3 ) = Z( 1 ) diff --git a/lapack-netlib/SRC/dorgbr.f b/lapack-netlib/SRC/dorgbr.f index cfebda5ab..6868fc38d 100644 --- a/lapack-netlib/SRC/dorgbr.f +++ b/lapack-netlib/SRC/dorgbr.f @@ -221,8 +221,8 @@ CALL DORGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( M.GT.1 ) THEN - CALL DORGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL DORGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF ELSE @@ -230,8 +230,8 @@ CALL DORGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( N.GT.1 ) THEN - CALL DORGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL DORGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF END IF diff --git a/lapack-netlib/SRC/dorgtsqr_row.f b/lapack-netlib/SRC/dorgtsqr_row.f new file mode 100644 index 000000000..94f8b0120 --- /dev/null +++ b/lapack-netlib/SRC/dorgtsqr_row.f @@ -0,0 +1,379 @@ +*> \brief \b DORGTSQR_ROW +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DORGTSQR_ROW + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DORGTSQR_ROW generates an M-by-N real matrix Q_out with +*> orthonormal columns from the output of DLATSQR. These N orthonormal +*> columns are the first N columns of a product of complex unitary +*> matrices Q(k)_in of order M, which are returned by DLATSQR in +*> a special format. +*> +*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ). +*> +*> The input matrices Q(k)_in are stored in row and column blocks in A. +*> See the documentation of DLATSQR for more details on the format of +*> Q(k)_in, where each Q(k)_in is represented by block Householder +*> transformations. This routine calls an auxiliary routine DLARFB_GETT, +*> where the computation is performed on each individual block. The +*> algorithm first sweeps NB-sized column blocks from the right to left +*> starting in the bottom row block and continues to the top row block +*> (hence _ROW in the routine name). This sweep is in reverse order of +*> the order in which DLATSQR generates the output blocks. 
+*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB +*> \verbatim +*> MB is INTEGER +*> The row block size used by DLATSQR to return +*> arrays A and T. MB > N. +*> (Note that if MB > M, then M is used instead of MB +*> as the row block size). +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by DLATSQR to return +*> arrays A and T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by DLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See DLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is DOUBLE PRECISION array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See DLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
+* +*> \ingroup doubleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE DORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + DOUBLE PRECISION DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL DLARFB_GETT, DLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DORGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL DLASET('U', M, N, ZERO, ONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. +* + MB2 = MB - N + M_PLUS_ONE = M + 1 + ITMP = ( M - MB - 1 ) / MB2 + IB_BOTTOM = ITMP * MB2 + MB + 1 + NUM_ALL_ROW_BLOCKS = ITMP + 2 + JB_T = NUM_ALL_ROW_BLOCKS * N + 1 +* + DO IB = IB_BOTTOM, MB+1, -MB2 +* +* Determine the block size IMB for the current row block +* in the matrix A. 
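+*            (Only the bottom-most row block, processed first, can be
+*            smaller than MB2; every other row block in this loop has
+*            IMB = MB2.)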
+* + IMB = MIN( M_PLUS_ONE - IB, MB2 ) +* +* Determine the column index JB_T for the current column block +* in the matrix T. +* + JB_T = JB_T - N +* +* Apply column blocks of H in the row block from right to left. +* +* KB is the column index of the current column block reflector +* in the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + CALL DLARFB_GETT( 'I', IMB, N-KB+1, KNB, + $ T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA, + $ A( IB, KB ), LDA, WORK, KNB ) +* + END DO +* + END DO +* + END IF +* +* (2) Top row block of A. +* NOTE: If MB>=M, then we have only one row block of A of size M +* and we work on the entire matrix A. +* + MB1 = MIN( MB, M ) +* +* Apply column blocks of H in the top row block from right to left. +* +* KB is the column index of the current block reflector in +* the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + IF( MB1-KB-KNB+1.EQ.0 ) THEN +* +* In SLARFB_GETT parameters, when M=0, then the matrix B +* does not exist, hence we need to pass a dummy array +* reference DUMMY(1,1) to B with LDDUMMY=1. +* + CALL DLARFB_GETT( 'N', 0, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ DUMMY( 1, 1 ), 1, WORK, KNB ) + ELSE + CALL DLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ A( KB+KNB, KB), LDA, WORK, KNB ) + + END IF +* + END DO +* + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN +* +* End of DORGTSQR_ROW +* + END diff --git a/lapack-netlib/SRC/dtgsja.f b/lapack-netlib/SRC/dtgsja.f index 66f32b790..537bd3f4f 100644 --- a/lapack-netlib/SRC/dtgsja.f +++ b/lapack-netlib/SRC/dtgsja.f @@ -400,7 +400,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - DOUBLE PRECISION ZERO, ONE + DOUBLE PRECISION ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. @@ -419,7 +419,8 @@ $ DSCAL, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC ABS, MAX, MIN + INTRINSIC ABS, MAX, MIN, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -596,9 +597,9 @@ * A1 = A( K+I, N-L+I ) B1 = B( I, N-L+I ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * * change sign if necessary * diff --git a/lapack-netlib/SRC/sgeqrt2.f b/lapack-netlib/SRC/sgeqrt2.f index 349fd4b60..f6532f812 100644 --- a/lapack-netlib/SRC/sgeqrt2.f +++ b/lapack-netlib/SRC/sgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup realGEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE SGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. 
INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN diff --git a/lapack-netlib/SRC/sgesdd.f b/lapack-netlib/SRC/sgesdd.f index 689494dd1..89e03a002 100644 --- a/lapack-netlib/SRC/sgesdd.f +++ b/lapack-netlib/SRC/sgesdd.f @@ -267,9 +267,9 @@ $ XERBLA * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, SISNAN REAL SLAMCH, SLANGE - EXTERNAL SLAMCH, SLANGE, LSAME + EXTERNAL SLAMCH, SLANGE, LSAME, SISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -599,6 +599,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = SLANGE( 'M', M, N, A, LDA, DUM ) + IF( SISNAN( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN ISCL = 1 diff --git a/lapack-netlib/SRC/sgetsqrhrt.f b/lapack-netlib/SRC/sgetsqrhrt.f new file mode 100644 index 000000000..f9580da7b --- /dev/null +++ b/lapack-netlib/SRC/sgetsqrhrt.f @@ -0,0 +1,349 @@ +*> \brief \b SGETSQRHRT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SGETSQRHRT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SGETSQRHRT computes a NB2-sized column blocked QR-factorization +*> of a complex M-by-N matrix A with M >= N, +*> +*> A = Q * R. +*> +*> The routine uses internally a NB1-sized column blocked and MB1-sized +*> row blocked TSQR-factorization and perfors the reconstruction +*> of the Householder vectors from the TSQR output. The routine also +*> converts the R_tsqr factor from the TSQR-factorization output into +*> the R factor that corresponds to the Householder QR-factorization, +*> +*> A = Q_tsqr * R_tsqr = Q * R. +*> +*> The output Q and R factors are stored in the same format as in SGEQRT +*> (Q is in blocked compact WY-representation). See the documentation +*> of SGEQRT for more details on the format. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> The row block size to be used in the blocked TSQR. +*> MB1 > N. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> The column block size to be used in the blocked TSQR. +*> N >= NB1 >= 1. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> The block size to be used in the blocked QR that is +*> output. NB2 >= 1. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> +*> On entry: an M-by-N matrix A. 
+*> +*> On exit: +*> a) the elements on and above the diagonal +*> of the array contain the N-by-N upper-triangular +*> matrix R corresponding to the Householder QR; +*> b) the elements below the diagonal represent Q by +*> the columns of blocked V (compact WY-representation). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is REAL array, dimension (LDT,N)) +*> The upper triangular block reflectors stored in compact form +*> as a sequence of upper triangular blocks. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= NB2. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> where +*> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), +*> NB1LOCAL = MIN(NB1,N). +*> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, +*> LW1 = NB1LOCAL * N, +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup singleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. + REAL A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, + $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS +* .. +* .. External Subroutines .. + EXTERNAL SCOPY, SLATSQR, SORGTSQR_ROW, SORHR_COL, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. 
M.LT.N ) THEN + INFO = -2 + ELSE IF( MB1.LE.N ) THEN + INFO = -3 + ELSE IF( NB1.LT.1 ) THEN + INFO = -4 + ELSE IF( NB2.LT.1 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + INFO = -9 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array: +* a) Matrix T and WORK for SLATSQR; +* b) N-by-N upper-triangular factor R_tsqr; +* c) Matrix T and array WORK for SORGTSQR_ROW; +* d) Diagonal D for SORHR_COL. +* + IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. +* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of SORGTSQR_ROW work array. +* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. +* + CALL SLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL SCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL SORGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL SORHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. +* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the SORHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. 
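+*     Row I of R_tsqr (held in WORK) is negated when DIAG(I) = -ONE and
+*     copied unchanged otherwise.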
+* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-ONE ) THEN + DO J = I, N + A( I, J ) = -ONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL SCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = REAL( LWORKOPT ) + RETURN +* +* End of SGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/sggglm.f b/lapack-netlib/SRC/sggglm.f index fe63da5f5..572ee511d 100644 --- a/lapack-netlib/SRC/sggglm.f +++ b/lapack-netlib/SRC/sggglm.f @@ -270,8 +270,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = ZERO + END DO + DO I = 1, P + Y(I) = ZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/shseqr.f b/lapack-netlib/SRC/shseqr.f index b5707f2c3..d22bd7b94 100644 --- a/lapack-netlib/SRC/shseqr.f +++ b/lapack-netlib/SRC/shseqr.f @@ -338,10 +338,10 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare SLAHQR failure. NL > NTINY = 11 is +* . through a rare SLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/slanv2.f b/lapack-netlib/SRC/slanv2.f index e678305f2..375645b75 100644 --- a/lapack-netlib/SRC/slanv2.f +++ b/lapack-netlib/SRC/slanv2.f @@ -139,7 +139,7 @@ * ===================================================================== * * .. Parameters .. - REAL ZERO, HALF, ONE + REAL ZERO, HALF, ONE, TWO PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0, $ TWO = 2.0E+0 ) REAL MULTPL diff --git a/lapack-netlib/SRC/slaqr0.f b/lapack-netlib/SRC/slaqr0.f index 318b46943..b1ebaff75 100644 --- a/lapack-netlib/SRC/slaqr0.f +++ b/lapack-netlib/SRC/slaqr0.f @@ -277,7 +277,7 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -361,22 +361,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'SLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'SLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -424,7 +424,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. 
==== @@ -575,7 +575,7 @@ * * ==== Got NS/2 or fewer shifts? Use SLAQR4 or * . SLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -697,7 +697,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/slaqr4.f b/lapack-netlib/SRC/slaqr4.f index cd642e07f..4ba2f8757 100644 --- a/lapack-netlib/SRC/slaqr4.f +++ b/lapack-netlib/SRC/slaqr4.f @@ -287,7 +287,7 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -371,22 +371,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'SLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'SLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -434,7 +434,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -585,7 +585,7 @@ * * ==== Got NS/2 or fewer shifts? Use SLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -700,7 +700,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/slaqr5.f b/lapack-netlib/SRC/slaqr5.f index f04ee577e..d60a1d3c0 100644 --- a/lapack-netlib/SRC/slaqr5.f +++ b/lapack-netlib/SRC/slaqr5.f @@ -70,10 +70,9 @@ *> matrix entries. *> = 1: SLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: SLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. 
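+*>                  KACC22 = 2 is still accepted as input and is
+*>                  treated exactly like KACC22 = 1.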
*> \endverbatim *> *> \param[in] N @@ -178,14 +177,14 @@ *> *> \param[out] U *> \verbatim -*> U is REAL array, dimension (LDU,3*NSHFTS-3) +*> U is REAL array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -197,7 +196,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is REAL array, dimension (LDWV,3*NSHFTS-3) +*> WV is REAL array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -223,7 +222,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -234,7 +233,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup realOTHERauxiliary * @@ -243,6 +242,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -252,10 +256,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE SLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, $ SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, $ LDU, NV, WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -282,11 +291,11 @@ REAL ALPHA, BETA, H11, H12, H21, H22, REFSUM, $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, $ ULP - INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. REAL SLAMCH @@ -356,10 +365,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -371,28 +376,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 220 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL SLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). 
The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS-1 columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 150 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -401,17 +417,134 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 20 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL SLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), + $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), + $ V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. 
==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) + IF( TST1.EQ.ZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + ABS( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + ABS( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + ABS( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + ABS( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + ABS( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + ABS( H( K+4, K+1 ) ) + END IF + IF( ABS( H( K+1, K ) ).LE.MAX( SMLNUM, ULP*TST1 ) ) + $ THEN + H12 = MAX( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H21 = MIN( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H11 = MAX( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL SLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), @@ -419,7 +552,20 @@ ALPHA = V( 1, M ) CALL SLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*V( 2, M ) + H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*V( 3, M ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL SLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -467,7 +613,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -481,154 +627,29 @@ END IF END IF END IF - 20 CONTINUE -* -* ==== Generate a 2-by-2 reflection, if needed. ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL SLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), - $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), - $ V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF * -* ==== Multiply H by reflections from the left ==== +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. 
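The vigilant-deflation test coded above for the 2-by-2 bulge (and repeated below for the 3-by-3 chain) combines the traditional small-compared-to-nearby-diagonals check with the Ahues & Tisseur (LAWN 122) criterion. A stand-alone free-form sketch of that test on a single 2-by-2 window; it leaves out the extra off-window terms the library adds to TST1 when TST1 would otherwise vanish, and ULP/SMLNUM are assumed to come from SLAMCH as in the routine:

   logical function can_deflate( hkk, hkk1, hk1k, hk1k1, ulp, smlnum )
      ! window entries: hkk = H(k,k), hkk1 = H(k,k+1), hk1k = H(k+1,k), hk1k1 = H(k+1,k+1)
      implicit none
      real, intent(in) :: hkk, hkk1, hk1k, hk1k1, ulp, smlnum
      real :: tst1, tst2, h11, h12, h21, h22, scl
      can_deflate = .false.
      if ( hk1k == 0.0 ) then
         can_deflate = .true.            ! subdiagonal already zero
         return
      end if
      ! traditional criterion: subdiagonal small against nearby diagonals
      tst1 = abs( hkk ) + abs( hk1k1 )
      if ( abs( hk1k ) <= max( smlnum, ulp*tst1 ) ) then
         ! Ahues & Tisseur criterion on the 2-by-2 block
         h12 = max( abs( hk1k ), abs( hkk1 ) )
         h21 = min( abs( hk1k ), abs( hkk1 ) )
         h11 = max( abs( hk1k1 ), abs( hkk - hk1k1 ) )
         h22 = min( abs( hk1k1 ), abs( hkk - hk1k1 ) )
         scl  = h11 + h12
         tst2 = h22*( h11 / scl )
         if ( tst2 == 0.0 .or. h21*( h12 / scl ) <= max( smlnum, ulp*tst2 ) ) then
            can_deflate = .true.         ! safe to set H(k+1,k) to zero
         end if
      end if
   end function can_deflate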
==== * - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 40 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 30 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 30 CONTINUE - 40 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 50 J = MAX( K+1, KTOP ), JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 50 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 90 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 60 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) - 60 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 70 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) - 70 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. ==== -* - DO 80 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) - 80 CONTINUE - END IF - END IF - 90 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 100 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) - 100 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 110 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM* - $ V( 2, M22 ) - 110 CONTINUE - ELSE IF( WANTZ ) THEN - DO 120 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) - 120 CONTINUE - END IF - END IF - END IF + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) + H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + 70 CONTINUE * -* ==== Vigilant deflation check ==== +* ==== Perform update from left for subsequent +* . column. 
==== * - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 130 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) + REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* + $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -639,6 +660,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) IF( TST1.EQ.ZERO ) THEN @@ -667,25 +690,77 @@ TST2 = H22*( H11 / SCL ) * IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. - $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF END IF END IF - 130 CONTINUE + 80 CONTINUE +* +* ==== Multiply H by reflections from the left ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* + $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE * -* ==== Fill in the last row of each bulge. ==== +* ==== Accumulate orthogonal transformations. ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 140 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*V( 2, M ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M ) - 140 CONTINUE + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) ==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 150 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -699,220 +774,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN -* -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . 
bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. ==== -* - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 -* -* ==== Horizontal Multiply ==== -* - DO 160 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL SGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL SLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 160 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 +* +* ==== Horizontal Multiply ==== +* + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL SGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL SLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE +* +* ==== Vertical multiply ==== +* + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE +* +* ==== Z multiply (also vertical) ==== +* + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) + $ Z( JROW, INCOL+K1 ), LDZ ) 170 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 180 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 180 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 190 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) 
==== -* - CALL SLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL SLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL STRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**T ==== -* - CALL SGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL SLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL STRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL SLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 190 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL SLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL SLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL STRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL SLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL STRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL SLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 200 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 210 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL SLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL SLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL STRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL SLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL STRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL SLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 210 CONTINUE - END IF END IF END IF - 220 CONTINUE + 180 CONTINUE * * ==== End of SLAQR5 ==== * diff --git a/lapack-netlib/SRC/slarfb_gett.f b/lapack-netlib/SRC/slarfb_gett.f new file mode 100644 index 000000000..7719f2965 --- /dev/null +++ b/lapack-netlib/SRC/slarfb_gett.f @@ -0,0 +1,596 @@ +*> \brief \b SLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), B( LDB, * ), T( LDT, * ), +* $ WORK( LDWORK, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLARFB_GETT applies a real Householder block reflector H from the +*> left to a real (K+M)-by-N "triangular-pentagonal" matrix +*> composed of two block matrices: an upper trapezoidal K-by-N matrix A +*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored +*> in the array B. The block reflector H is stored in a compact +*> WY-representation, where the elementary reflectors are in the +*> arrays A, B and T. See Further Details section. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] IDENT +*> \verbatim +*> IDENT is CHARACTER*1 +*> If IDENT = not 'I', or not 'i', then V1 is unit +*> lower-triangular and stored in the left K-by-K block of +*> the input matrix A, +*> If IDENT = 'I' or 'i', then V1 is an identity matrix and +*> not stored. +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix B. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrices A and B. +*> N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The number or rows of the matrix A. +*> K is also order of the matrix T, i.e. the number of +*> elementary reflectors whose product defines the block +*> reflector. 0 <= K <= N. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is REAL array, dimension (LDT,K) +*> The upper-triangular K-by-K matrix T in the representation +*> of the block reflector. 
+*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> +*> On entry: +*> a) In the K-by-N upper-trapezoidal part A: input matrix A. +*> b) In the columns below the diagonal: columns of V1 +*> (ones are not stored on the diagonal). +*> +*> On exit: +*> A is overwritten by rectangular K-by-N product H*A. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array A. LDA >= max(1,K). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is REAL array, dimension (LDB,N) +*> +*> On entry: +*> a) In the M-by-(N-K) right block: input matrix B. +*> b) In the M-by-N left block: columns of V2. +*> +*> On exit: +*> B is overwritten by rectangular M-by-N product H*B. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,M). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, +*> dimension (LDWORK,max(K,N-K)) +*> \endverbatim +*> +*> \param[in] LDWORK +*> \verbatim +*> LDWORK is INTEGER +*> The leading dimension of the array WORK. LDWORK>=max(1,K). +*> +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup singleOTHERauxiliary +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> (1) Description of the Algebraic Operation. +*> +*> The matrix A is a K-by-N matrix composed of two column block +*> matrices, A1, which is K-by-K, and A2, which is K-by-(N-K): +*> A = ( A1, A2 ). +*> The matrix B is an M-by-N matrix composed of two column block +*> matrices, B1, which is M-by-K, and B2, which is M-by-(N-K): +*> B = ( B1, B2 ). +*> +*> Perform the operation: +*> +*> ( A_out ) := H * ( A_in ) = ( I - V * T * V**T ) * ( A_in ) = +*> ( B_out ) ( B_in ) ( B_in ) +*> = ( I - ( V1 ) * T * ( V1**T, V2**T ) ) * ( A_in ) +*> ( V2 ) ( B_in ) +*> On input: +*> +*> a) ( A_in ) consists of two block columns: +*> ( B_in ) +*> +*> ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in )) +*> ( B_in ) (( B1_in ) ( B2_in )) (( 0 ) ( B2_in )), +*> +*> where the column blocks are: +*> +*> ( A1_in ) is a K-by-K upper-triangular matrix stored in the +*> upper triangular part of the array A(1:K,1:K). +*> ( B1_in ) is an M-by-K rectangular ZERO matrix and not stored. +*> +*> ( A2_in ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_in ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> b) V = ( V1 ) +*> ( V2 ) +*> +*> where: +*> 1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored; +*> 2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix, +*> stored in the lower-triangular part of the array +*> A(1:K,1:K) (ones are not stored), +*> and V2 is an M-by-K rectangular stored the array B(1:M,1:K), +*> (because on input B1_in is a rectangular zero +*> matrix that is not stored and the space is +*> used to store V2). +*> +*> c) T is a K-by-K upper-triangular matrix stored +*> in the array T(1:K,1:K). 
+*> +*> On output: +*> +*> a) ( A_out ) consists of two block columns: +*> ( B_out ) +*> +*> ( A_out ) = (( A1_out ) ( A2_out )) +*> ( B_out ) (( B1_out ) ( B2_out )), +*> +*> where the column blocks are: +*> +*> ( A1_out ) is a K-by-K square matrix, or a K-by-K +*> upper-triangular matrix, if V1 is an +*> identity matrix. AiOut is stored in +*> the array A(1:K,1:K). +*> ( B1_out ) is an M-by-K rectangular matrix stored +*> in the array B(1:M,K:N). +*> +*> ( A2_out ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_out ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> +*> The operation above can be represented as the same operation +*> on each block column: +*> +*> ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**T ) * ( A1_in ) +*> ( B1_out ) ( 0 ) ( 0 ) +*> +*> ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**T ) * ( A2_in ) +*> ( B2_out ) ( B2_in ) ( B2_in ) +*> +*> If IDENT != 'I': +*> +*> The computation for column block 1: +*> +*> A1_out: = A1_in - V1*T*(V1**T)*A1_in +*> +*> B1_out: = - V2*T*(V1**T)*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - V1*T*( (V1**T)*A2_in + (V2**T)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( (V1**T)*A2_in + (V2**T)*B2_in ) +*> +*> If IDENT == 'I': +*> +*> The operation for column block 1: +*> +*> A1_out: = A1_in - V1*T**A1_in +*> +*> B1_out: = - V2*T**A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - T*( A2_in + (V2**T)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( A2_in + (V2**T)*B2_in ) +*> +*> (2) Description of the Algorithmic Computation. +*> +*> In the first step, we compute column block 2, i.e. A2 and B2. +*> Here, we need to use the K-by-(N-K) rectangular workspace +*> matrix W2 that is of the same size as the matrix A2. +*> W2 is stored in the array WORK(1:K,1:(N-K)). +*> +*> In the second step, we compute column block 1, i.e. A1 and B1. +*> Here, we need to use the K-by-K square workspace matrix W1 +*> that is of the same size as the as the matrix A1. +*> W1 is stored in the array WORK(1:K,1:K). +*> +*> NOTE: Hence, in this routine, we need the workspace array WORK +*> only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from +*> the first step and W1 from the second step. +*> +*> Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I', +*> more computations than in the Case (B). +*> +*> if( IDENT != 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(2) W2: = (V1**T) * W2 = (unit_lower_tr_of_(A1)**T) * W2 +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(2) W1: = (V1**T) * W1 = (unit_lower_tr_of_(A1)**T) * W1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6) square A1: = A1 - W1 +*> end if +*> end if +*> +*> Case (B), when V1 is an identity matrix, i.e. 
IDENT == 'I', +*> less computations than in the Case (A) +*> +*> if( IDENT == 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(6) upper-triangular_of_(A1): = A1 - W1 +*> end if +*> end if +*> +*> Combine these cases (A) and (B) together, this is the resulting +*> algorithm: +*> +*> if ( N > K ) then +*> +*> (First Step - column block 2) +*> +*> col2_(1) W2: = A2 +*> if( IDENT != 'I' ) then +*> col2_(2) W2: = (V1**T) * W2 +*> = (unit_lower_tr_of_(A1)**T) * W2 +*> end if +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2] +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> if( IDENT != 'I' ) then +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> end if +*> col2_(7) A2: = A2 - W2 +*> +*> else +*> +*> (Second Step - column block 1) +*> +*> col1_(1) W1: = A1 +*> if( IDENT != 'I' ) then +*> col1_(2) W1: = (V1**T) * W1 +*> = (unit_lower_tr_of_(A1)**T) * W1 +*> end if +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> if( IDENT != 'I' ) then +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1) +*> end if +*> col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1) +*> +*> end if +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE SLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, + $ WORK, LDWORK ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER IDENT + INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ), T( LDT, * ), + $ WORK( LDWORK, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LNOTIDENT + INTEGER I, J +* .. +* .. EXTERNAL FUNCTIONS .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL SCOPY, SGEMM, STRMM +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N ) + $ RETURN +* + LNOTIDENT = .NOT.LSAME( IDENT, 'I' ) +* +* ------------------------------------------------------------------ +* +* First Step. Computation of the Column Block 2: +* +* ( A2 ) := H * ( A2 ) +* ( B2 ) ( B2 ) +* +* ------------------------------------------------------------------ +* + IF( N.GT.K ) THEN +* +* col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N) +* into W2=WORK(1:K, 1:N-K) column-by-column. +* + DO J = 1, N-K + CALL SCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 ) + END DO + + IF( LNOTIDENT ) THEN +* +* col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2, +* V1 is not an identy matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored). +* +* + CALL STRMM( 'L', 'L', 'T', 'U', K, N-K, ONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(3) Compute W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +* V2 stored in B1. 
+* + IF( M.GT.0 ) THEN + CALL SGEMM( 'T', 'N', K, N-K, M, ONE, B, LDB, + $ B( 1, K+1 ), LDB, ONE, WORK, LDWORK ) + END IF +* +* col2_(4) Compute W2: = T * W2, +* T is upper-triangular. +* + CALL STRMM( 'L', 'U', 'N', 'N', K, N-K, ONE, T, LDT, + $ WORK, LDWORK ) +* +* col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2, +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL SGEMM( 'N', 'N', M, N-K, K, -ONE, B, LDB, + $ WORK, LDWORK, ONE, B( 1, K+1 ), LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col2_(6) Compute W2: = V1 * W2 = A1 * W2, +* V1 is not an identity matrix, but unit lower-triangular, +* V1 stored in A1 (diagonal ones are not stored). +* + CALL STRMM( 'L', 'L', 'N', 'U', K, N-K, ONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(7) Compute A2: = A2 - W2 = +* = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K), +* column-by-column. +* + DO J = 1, N-K + DO I = 1, K + A( I, K+J ) = A( I, K+J ) - WORK( I, J ) + END DO + END DO +* + END IF +* +* ------------------------------------------------------------------ +* +* Second Step. Computation of the Column Block 1: +* +* ( A1 ) := H * ( A1 ) +* ( B1 ) ( 0 ) +* +* ------------------------------------------------------------------ +* +* col1_(1) Compute W1: = A1. Copy the upper-triangular +* A1 = A(1:K, 1:K) into the upper-triangular +* W1 = WORK(1:K, 1:K) column-by-column. +* + DO J = 1, K + CALL SCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 ) + END DO +* +* Set the subdiagonal elements of W1 to zero column-by-column. +* + DO J = 1, K - 1 + DO I = J + 1, K + WORK( I, J ) = ZERO + END DO + END DO +* + IF( LNOTIDENT ) THEN +* +* col1_(2) Compute W1: = (V1**T) * W1 = (A1**T) * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL STRMM( 'L', 'L', 'T', 'U', K, K, ONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col1_(3) Compute W1: = T * W1, +* T is upper-triangular, +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL STRMM( 'L', 'U', 'N', 'N', K, K, ONE, T, LDT, + $ WORK, LDWORK ) +* +* col1_(4) Compute B1: = - V2 * W1 = - B1 * W1, +* V2 = B1, W1 is upper-triangular with zeroes below the diagonal. +* + IF( M.GT.0 ) THEN + CALL STRMM( 'R', 'U', 'N', 'N', M, K, -ONE, WORK, LDWORK, + $ B, LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col1_(5) Compute W1: = V1 * W1 = A1 * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular on input with zeroes below the diagonal, +* and square on output. +* + CALL STRMM( 'L', 'L', 'N', 'U', K, K, ONE, A, LDA, + $ WORK, LDWORK ) +* +* col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K) +* column-by-column. A1 is upper-triangular on input. +* If IDENT, A1 is square on output, and W1 is square, +* if NOT IDENT, A1 is upper-triangular on output, +* W1 is upper-triangular. +* +* col1_(6)_a Compute elements of A1 below the diagonal. +* + DO J = 1, K - 1 + DO I = J + 1, K + A( I, J ) = - WORK( I, J ) + END DO + END DO +* + END IF +* +* col1_(6)_b Compute elements of A1 on and above the diagonal. 
+* + DO J = 1, K + DO I = 1, J + A( I, J ) = A( I, J ) - WORK( I, J ) + END DO + END DO +* + RETURN +* +* End of SLARFB_GETT +* + END diff --git a/lapack-netlib/SRC/slarrv.f b/lapack-netlib/SRC/slarrv.f index 04519fde8..9448b2fd9 100644 --- a/lapack-netlib/SRC/slarrv.f +++ b/lapack-netlib/SRC/slarrv.f @@ -353,7 +353,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/slasq2.f b/lapack-netlib/SRC/slasq2.f index 6e5f86447..c0c71b82e 100644 --- a/lapack-netlib/SRC/slasq2.f +++ b/lapack-netlib/SRC/slasq2.f @@ -183,10 +183,18 @@ * * 2-by-2 case. * - IF( Z( 2 ).LT.ZERO .OR. Z( 3 ).LT.ZERO ) THEN - INFO = -2 + IF( Z( 1 ).LT.ZERO ) THEN + INFO = -201 + CALL XERBLA( 'SLASQ2', 2 ) + RETURN + ELSE IF( Z( 2 ).LT.ZERO ) THEN + INFO = -202 CALL XERBLA( 'SLASQ2', 2 ) RETURN + ELSE IF( Z( 3 ).LT.ZERO ) THEN + INFO = -203 + CALL XERBLA( 'SLASQ2', 2 ) + RETURN ELSE IF( Z( 3 ).GT.Z( 1 ) ) THEN D = Z( 3 ) Z( 3 ) = Z( 1 ) diff --git a/lapack-netlib/SRC/sorgbr.f b/lapack-netlib/SRC/sorgbr.f index dccdbb58a..2266505dc 100644 --- a/lapack-netlib/SRC/sorgbr.f +++ b/lapack-netlib/SRC/sorgbr.f @@ -221,8 +221,8 @@ CALL SORGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( M.GT.1 ) THEN - CALL SORGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL SORGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF ELSE @@ -230,8 +230,8 @@ CALL SORGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( N.GT.1 ) THEN - CALL SORGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL SORGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF END IF diff --git a/lapack-netlib/SRC/sorgtsqr_row.f b/lapack-netlib/SRC/sorgtsqr_row.f new file mode 100644 index 000000000..d2a2150cd --- /dev/null +++ b/lapack-netlib/SRC/sorgtsqr_row.f @@ -0,0 +1,379 @@ +*> \brief \b SORGTSQR_ROW +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SORGTSQR_ROW + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SORGTSQR_ROW generates an M-by-N real matrix Q_out with +*> orthonormal columns from the output of SLATSQR. These N orthonormal +*> columns are the first N columns of a product of complex unitary +*> matrices Q(k)_in of order M, which are returned by SLATSQR in +*> a special format. +*> +*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ). +*> +*> The input matrices Q(k)_in are stored in row and column blocks in A. +*> See the documentation of SLATSQR for more details on the format of +*> Q(k)_in, where each Q(k)_in is represented by block Householder +*> transformations. This routine calls an auxiliary routine SLARFB_GETT, +*> where the computation is performed on each individual block. The +*> algorithm first sweeps NB-sized column blocks from the right to left +*> starting in the bottom row block and continues to the top row block +*> (hence _ROW in the routine name). This sweep is in reverse order of +*> the order in which SLATSQR generates the output blocks. 
+*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB +*> \verbatim +*> MB is INTEGER +*> The row block size used by SLATSQR to return +*> arrays A and T. MB > N. +*> (Note that if MB > M, then M is used instead of MB +*> as the row block size). +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by SLATSQR to return +*> arrays A and T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by SLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See SLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is REAL array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See SLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
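SORGTSQR_ROW is meant to be fed directly with the A and T arrays produced by SLATSQR. A hedged usage sketch: the sizes, the fixed-length WORK array and the random test matrix are illustrative only, and a production caller would size WORK via the usual LWORK = -1 query of both routines:

   program tsqr_explicit_q_sketch
      implicit none
      integer, parameter :: m = 12, n = 3, mb = 6, nb = 3
      ! number of input row blocks, NIRB = max(1, ceil((M-N)/(MB-N)))
      integer, parameter :: nirb = max( 1, ( (m-n) + (mb-n) - 1 ) / (mb-n) )
      real :: a(m,n), t(nb,n*nirb), work(256)
      integer :: info
      call random_number( a )                  ! tall-skinny test matrix
      call slatsqr( m, n, mb, nb, a, m, t, nb, work, size(work), info )
      if ( info /= 0 ) stop 'SLATSQR failed'
      ! (the R factor, held in the upper triangle of A after SLATSQR,
      !  would normally be copied out before it is overwritten by Q)
      call sorgtsqr_row( m, n, mb, nb, a, m, t, nb, work, size(work), info )
      if ( info /= 0 ) stop 'SORGTSQR_ROW failed'
      ! A now holds the M-by-N factor Q with orthonormal columns, so the
      ! inner product of two distinct columns should be about zero:
      print *, dot_product( a(:,1), a(:,2) )
   end program tsqr_explicit_q_sketch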
+* +*> \ingroup sigleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE SORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + REAL A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + REAL DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL SLARFB_GETT, SLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SORGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL SLASET('U', M, N, ZERO, ONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. +* + MB2 = MB - N + M_PLUS_ONE = M + 1 + ITMP = ( M - MB - 1 ) / MB2 + IB_BOTTOM = ITMP * MB2 + MB + 1 + NUM_ALL_ROW_BLOCKS = ITMP + 2 + JB_T = NUM_ALL_ROW_BLOCKS * N + 1 +* + DO IB = IB_BOTTOM, MB+1, -MB2 +* +* Determine the block size IMB for the current row block +* in the matrix A. +* + IMB = MIN( M_PLUS_ONE - IB, MB2 ) +* +* Determine the column index JB_T for the current column block +* in the matrix T. 
+* + JB_T = JB_T - N +* +* Apply column blocks of H in the row block from right to left. +* +* KB is the column index of the current column block reflector +* in the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + CALL SLARFB_GETT( 'I', IMB, N-KB+1, KNB, + $ T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA, + $ A( IB, KB ), LDA, WORK, KNB ) +* + END DO +* + END DO +* + END IF +* +* (2) Top row block of A. +* NOTE: If MB>=M, then we have only one row block of A of size M +* and we work on the entire matrix A. +* + MB1 = MIN( MB, M ) +* +* Apply column blocks of H in the top row block from right to left. +* +* KB is the column index of the current block reflector in +* the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + IF( MB1-KB-KNB+1.EQ.0 ) THEN +* +* In SLARFB_GETT parameters, when M=0, then the matrix B +* does not exist, hence we need to pass a dummy array +* reference DUMMY(1,1) to B with LDDUMMY=1. +* + CALL SLARFB_GETT( 'N', 0, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ DUMMY( 1, 1 ), 1, WORK, KNB ) + ELSE + CALL SLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ A( KB+KNB, KB), LDA, WORK, KNB ) + + END IF +* + END DO +* + WORK( 1 ) = REAL( LWORKOPT ) + RETURN +* +* End of SORGTSQR_ROW +* + END diff --git a/lapack-netlib/SRC/stgsja.f b/lapack-netlib/SRC/stgsja.f index 2a6fc354d..7324da431 100644 --- a/lapack-netlib/SRC/stgsja.f +++ b/lapack-netlib/SRC/stgsja.f @@ -400,7 +400,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - REAL ZERO, ONE + REAL ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. @@ -419,7 +419,8 @@ $ SSCAL, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC ABS, MAX, MIN + INTRINSIC ABS, MAX, MIN, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -596,9 +597,9 @@ * A1 = A( K+I, N-L+I ) B1 = B( I, N-L+I ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * * change sign if necessary * diff --git a/lapack-netlib/SRC/zgeqrt2.f b/lapack-netlib/SRC/zgeqrt2.f index bad708498..34d9d544f 100644 --- a/lapack-netlib/SRC/zgeqrt2.f +++ b/lapack-netlib/SRC/zgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup complex16GEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE ZGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. 
INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN diff --git a/lapack-netlib/SRC/zgesdd.f b/lapack-netlib/SRC/zgesdd.f index bb9d2c26e..2209f4733 100644 --- a/lapack-netlib/SRC/zgesdd.f +++ b/lapack-netlib/SRC/zgesdd.f @@ -281,9 +281,9 @@ $ ZLASET, ZUNGBR, ZUNGLQ, ZUNGQR, ZUNMBR * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, DISNAN DOUBLE PRECISION DLAMCH, ZLANGE - EXTERNAL LSAME, DLAMCH, ZLANGE + EXTERNAL LSAME, DLAMCH, ZLANGE, DISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -647,6 +647,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = ZLANGE( 'M', M, N, A, LDA, DUM ) + IF( DISNAN( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN ISCL = 1 diff --git a/lapack-netlib/SRC/zgetsqrhrt.f b/lapack-netlib/SRC/zgetsqrhrt.f new file mode 100644 index 000000000..5f0167937 --- /dev/null +++ b/lapack-netlib/SRC/zgetsqrhrt.f @@ -0,0 +1,349 @@ +*> \brief \b ZGETSQRHRT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGETSQRHRT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGETSQRHRT computes a NB2-sized column blocked QR-factorization +*> of a complex M-by-N matrix A with M >= N, +*> +*> A = Q * R. +*> +*> The routine uses internally a NB1-sized column blocked and MB1-sized +*> row blocked TSQR-factorization and perfors the reconstruction +*> of the Householder vectors from the TSQR output. The routine also +*> converts the R_tsqr factor from the TSQR-factorization output into +*> the R factor that corresponds to the Householder QR-factorization, +*> +*> A = Q_tsqr * R_tsqr = Q * R. +*> +*> The output Q and R factors are stored in the same format as in ZGEQRT +*> (Q is in blocked compact WY-representation). See the documentation +*> of ZGEQRT for more details on the format. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> The row block size to be used in the blocked TSQR. +*> MB1 > N. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> The column block size to be used in the blocked TSQR. +*> N >= NB1 >= 1. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> The block size to be used in the blocked QR that is +*> output. NB2 >= 1. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: an M-by-N matrix A. 
+*> +*> On exit: +*> a) the elements on and above the diagonal +*> of the array contain the N-by-N upper-triangular +*> matrix R corresponding to the Householder QR; +*> b) the elements below the diagonal represent Q by +*> the columns of blocked V (compact WY-representation). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is COMPLEX*16 array, dimension (LDT,N)) +*> The upper triangular block reflectors stored in compact form +*> as a sequence of upper triangular blocks. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= NB2. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> where +*> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), +*> NB1LOCAL = MIN(NB1,N). +*> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, +*> LW1 = NB1LOCAL * N, +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup comlpex16OTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, + $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS +* .. +* .. External Subroutines .. + EXTERNAL ZCOPY, ZLATSQR, ZUNGTSQR_ROW, ZUNHR_COL, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, DCMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. 
M.LT.N ) THEN + INFO = -2 + ELSE IF( MB1.LE.N ) THEN + INFO = -3 + ELSE IF( NB1.LT.1 ) THEN + INFO = -4 + ELSE IF( NB2.LT.1 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + INFO = -9 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array: +* a) Matrix T and WORK for ZLATSQR; +* b) N-by-N upper-triangular factor R_tsqr; +* c) Matrix T and array WORK for ZUNGTSQR_ROW; +* d) Diagonal D for ZUNHR_COL. +* + IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. +* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of ZUNGTSQR_ROW work array. +* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. +* + CALL ZLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL ZCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL ZUNGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL ZUNHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. +* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the ZUNHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. 
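+*
+*     For example, if N = 2 and the diagonal returned by ZUNHR_COL is
+*     DIAG = ( CONE, -CONE ), the first row of R_tsqr is copied from
+*     the work array into A( 1, 1:2 ) unchanged, while the second row
+*     changes sign, A( 2, 2 ) = -R_tsqr( 2, 2 ).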
+* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-CONE ) THEN + DO J = I, N + A( I, J ) = -CONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL ZCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN +* +* End of ZGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/zggglm.f b/lapack-netlib/SRC/zggglm.f index d6a30cee7..d4adc5c4d 100644 --- a/lapack-netlib/SRC/zggglm.f +++ b/lapack-netlib/SRC/zggglm.f @@ -271,8 +271,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = CZERO + END DO + DO I = 1, P + Y(I) = CZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/zhgeqz.f b/lapack-netlib/SRC/zhgeqz.f index b51cba4f7..960244727 100644 --- a/lapack-netlib/SRC/zhgeqz.f +++ b/lapack-netlib/SRC/zhgeqz.f @@ -319,13 +319,14 @@ DOUBLE PRECISION ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL, $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, - $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, - $ U12, X + $ CTEMP3, ESHIFT, S, SHIFT, SIGNBC, + $ U12, X, ABI12, Y * .. * .. External Functions .. + COMPLEX*16 ZLADIV LOGICAL LSAME DOUBLE PRECISION DLAMCH, ZLANHS - EXTERNAL LSAME, DLAMCH, ZLANHS + EXTERNAL ZLADIV, LSAME, DLAMCH, ZLANHS * .. * .. External Subroutines .. EXTERNAL XERBLA, ZLARTG, ZLASET, ZROT, ZSCAL @@ -351,6 +352,7 @@ ILSCHR = .TRUE. ISCHUR = 2 ELSE + ILSCHR = .TRUE. ISCHUR = 0 END IF * @@ -364,6 +366,7 @@ ILQ = .TRUE. ICOMPQ = 3 ELSE + ILQ = .TRUE. ICOMPQ = 0 END IF * @@ -377,6 +380,7 @@ ILZ = .TRUE. ICOMPZ = 3 ELSE + ILZ = .TRUE. ICOMPZ = 0 END IF * @@ -730,22 +734,34 @@ AD22 = ( ASCALE*H( ILAST, ILAST ) ) / $ ( BSCALE*T( ILAST, ILAST ) ) ABI22 = AD22 - U12*AD21 + ABI12 = AD12 - U12*AD11 * - T1 = HALF*( AD11+ABI22 ) - RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 ) - TEMP = DBLE( T1-ABI22 )*DBLE( RTDISC ) + - $ DIMAG( T1-ABI22 )*DIMAG( RTDISC ) - IF( TEMP.LE.ZERO ) THEN - SHIFT = T1 + RTDISC - ELSE - SHIFT = T1 - RTDISC + SHIFT = ABI22 + CTEMP = SQRT( ABI12 )*SQRT( AD21 ) + TEMP = ABS1( CTEMP ) + IF( CTEMP.NE.ZERO ) THEN + X = HALF*( AD11-SHIFT ) + TEMP2 = ABS1( X ) + TEMP = MAX( TEMP, ABS1( X ) ) + Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 ) + IF( TEMP2.GT.ZERO ) THEN + IF( DBLE( X / TEMP2 )*DBLE( Y )+ + $ DIMAG( X / TEMP2 )*DIMAG( Y ).LT.ZERO )Y = -Y + END IF + SHIFT = SHIFT - CTEMP*ZLADIV( CTEMP, ( X+Y ) ) END IF ELSE * * Exceptional shift. Chosen for no particularly good reason. * - ESHIFT = ESHIFT + (ASCALE*H(ILAST,ILAST-1))/ - $ (BSCALE*T(ILAST-1,ILAST-1)) + IF( ( IITER / 20 )*20.EQ.IITER .AND. + $ BSCALE*ABS1(T( ILAST, ILAST )).GT.SAFMIN ) THEN + ESHIFT = ESHIFT + ( ASCALE*H( ILAST, + $ ILAST ) )/( BSCALE*T( ILAST, ILAST ) ) + ELSE + ESHIFT = ESHIFT + ( ASCALE*H( ILAST, + $ ILAST-1 ) )/( BSCALE*T( ILAST-1, ILAST-1 ) ) + END IF SHIFT = ESHIFT END IF * diff --git a/lapack-netlib/SRC/zhseqr.f b/lapack-netlib/SRC/zhseqr.f index 2ee874dfd..e0fddd3a7 100644 --- a/lapack-netlib/SRC/zhseqr.f +++ b/lapack-netlib/SRC/zhseqr.f @@ -320,10 +320,10 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare ZLAHQR failure. NL > NTINY = 11 is +* . through a rare ZLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. 
(The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/zlaqr0.f b/lapack-netlib/SRC/zlaqr0.f index feffe9782..edf01bc7c 100644 --- a/lapack-netlib/SRC/zlaqr0.f +++ b/lapack-netlib/SRC/zlaqr0.f @@ -262,7 +262,7 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -357,22 +357,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'ZLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'ZLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -420,7 +420,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -560,7 +560,7 @@ * * ==== Got NS/2 or fewer shifts? Use ZLAQR4 or * . ZLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -661,7 +661,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/zlaqr4.f b/lapack-netlib/SRC/zlaqr4.f index a88f6508e..6d083fcda 100644 --- a/lapack-netlib/SRC/zlaqr4.f +++ b/lapack-netlib/SRC/zlaqr4.f @@ -268,7 +268,7 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -363,22 +363,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'ZLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. 
==== * NSR = ILAENV( 15, 'ZLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -426,7 +426,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -566,7 +566,7 @@ * * ==== Got NS/2 or fewer shifts? Use ZLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -661,7 +661,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/zlaqr5.f b/lapack-netlib/SRC/zlaqr5.f index 9ff7e7eca..c12f4b780 100644 --- a/lapack-netlib/SRC/zlaqr5.f +++ b/lapack-netlib/SRC/zlaqr5.f @@ -69,10 +69,9 @@ *> matrix entries. *> = 1: ZLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: ZLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. *> \endverbatim *> *> \param[in] N @@ -170,14 +169,14 @@ *> *> \param[out] U *> \verbatim -*> U is COMPLEX*16 array, dimension (LDU,3*NSHFTS-3) +*> U is COMPLEX*16 array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -189,7 +188,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is COMPLEX*16 array, dimension (LDWV,3*NSHFTS-3) +*> WV is COMPLEX*16 array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -215,7 +214,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -226,7 +225,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup complex16OTHERauxiliary * @@ -235,6 +234,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -244,10 +248,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). 
+*> * ===================================================================== SUBROUTINE ZLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, $ H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, LDU, NV, $ WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -276,11 +285,11 @@ COMPLEX*16 ALPHA, BETA, CDUM, REFSUM DOUBLE PRECISION H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP - INTEGER I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -334,10 +343,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -349,28 +354,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 210 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL ZLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 140 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -379,24 +395,156 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 10 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . 
separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL ZLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), + $ S( 2*M22 ), V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = DCONJG( V( 1, M22 ) )* + $ ( H( K+1, J )+DCONJG( V( 2, M22 ) )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. ==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) + IF( TST1.EQ.RZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + CABS1( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + CABS1( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + CABS1( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + CABS1( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + CABS1( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + CABS1( H( K+4, K+1 ) ) + END IF + IF( CABS1( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H21 = MIN( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H11 = MAX( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.RZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL ZLAQR1( 3, H( KTOP, KTOP ), LDH, S( 2*M-1 ), $ S( 2*M ), V( 1, M ) ) ALPHA = V( 1, M ) CALL ZLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. 
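+*              .    The full right update of row K+3 would involve
+*              .    H( K+3, K ), H( K+3, K+1 ) and H( K+3, K+2 );
+*              .    with the first two equal to ZERO, REFSUM reduces
+*              .    to V( 1, M )*V( 3, M )*H( K+3, K+2 ), and the
+*              .    first two entries simply receive -REFSUM and
+*              .    -REFSUM*DCONJG( V( 2, M ) ).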
==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*DCONJG( V( 2, M ) ) + H( K+3, K+2 ) = H( K+3, K+2 ) - + $ REFSUM*DCONJG( V( 3, M ) ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL ZLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -444,7 +592,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -458,163 +606,32 @@ END IF END IF END IF - 10 CONTINUE -* -* ==== Generate a 2-by-2 reflection, if needed. ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL ZLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), - $ S( 2*M22 ), V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF * -* ==== Multiply H by reflections from the left ==== -* - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 30 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 20 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = DCONJG( V( 1, M ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M ) )* - $ H( K+2, J )+DCONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 20 CONTINUE - 30 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 40 J = MAX( K+1, KTOP ), JBOT - REFSUM = DCONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 40 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 80 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 50 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 50 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 60 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 60 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
==== -* - DO 70 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 70 CONTINUE - END IF - END IF - 80 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 90 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 90 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 100 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 100 CONTINUE - ELSE IF( WANTZ ) THEN - DO 110 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 110 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 120 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. ==== +* + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + H( J, K+3 ) = H( J, K+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 70 CONTINUE +* +* ==== Perform update from left for subsequent +* . column. ==== +* + REFSUM = DCONJG( V( 1, M ) )*( H( K+1, K+1 ) + $ +DCONJG( V( 2, M ) )*H( K+2, K+1 ) + $ +DCONJG( V( 3, M ) )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -625,6 +642,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) IF( TST1.EQ.RZERO ) THEN @@ -658,23 +677,77 @@ $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO END IF END IF - 120 CONTINUE + 80 CONTINUE +* +* ==== Multiply H by reflections from the left ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF * -* ==== Fill in the last row of each bulge. 
==== + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = DCONJG( V( 1, M ) )* + $ ( H( K+1, J )+DCONJG( V( 2, M ) )* + $ H( K+2, J )+DCONJG( V( 3, M ) )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 130 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*DCONJG( V( 2, M ) ) - H( K+4, K+3 ) = H( K+4, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 130 CONTINUE +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) ==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + U( J, KMS+3 ) = U( J, KMS+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + Z( J, K+3 ) = Z( J, K+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 140 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -688,220 +761,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN -* -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. 
==== -* - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 -* -* ==== Horizontal Multiply ==== -* - DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL ZGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL ZLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 150 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 +* +* ==== Horizontal Multiply ==== +* + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL ZGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL ZLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE +* +* ==== Vertical multiply ==== +* + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL ZLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE +* +* ==== Z multiply (also vertical) ==== +* + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL ZLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) - 160 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 170 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL ZLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 170 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 180 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) 
==== -* - CALL ZLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL ZLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL ZTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**H ==== -* - CALL ZGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL ZLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL ZTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL ZLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 180 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 190 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL ZLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL ZTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL ZLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL ZLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 190 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 200 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL ZLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL ZLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL ZTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL ZLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL ZLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 200 CONTINUE - END IF + $ Z( JROW, INCOL+K1 ), LDZ ) + 170 CONTINUE END IF END IF - 210 CONTINUE + 180 CONTINUE * * ==== End of ZLAQR5 ==== * diff --git a/lapack-netlib/SRC/zlarfb_gett.f b/lapack-netlib/SRC/zlarfb_gett.f new file mode 100644 index 000000000..4a3c4dcf1 --- /dev/null +++ b/lapack-netlib/SRC/zlarfb_gett.f @@ -0,0 +1,597 @@ +*> \brief \b ZLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), B( LDB, * ), T( LDT, * ), +* $ WORK( LDWORK, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLARFB_GETT applies a complex Householder block reflector H from the +*> left to a complex (K+M)-by-N "triangular-pentagonal" matrix +*> composed of two block matrices: an upper trapezoidal K-by-N matrix A +*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored +*> in the array B. The block reflector H is stored in a compact +*> WY-representation, where the elementary reflectors are in the +*> arrays A, B and T. See Further Details section. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] IDENT +*> \verbatim +*> IDENT is CHARACTER*1 +*> If IDENT = not 'I', or not 'i', then V1 is unit +*> lower-triangular and stored in the left K-by-K block of +*> the input matrix A, +*> If IDENT = 'I' or 'i', then V1 is an identity matrix and +*> not stored. +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix B. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrices A and B. +*> N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The number or rows of the matrix A. +*> K is also order of the matrix T, i.e. the number of +*> elementary reflectors whose product defines the block +*> reflector. 0 <= K <= N. 
+*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX*16 array, dimension (LDT,K) +*> The upper-triangular K-by-K matrix T in the representation +*> of the block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: +*> a) In the K-by-N upper-trapezoidal part A: input matrix A. +*> b) In the columns below the diagonal: columns of V1 +*> (ones are not stored on the diagonal). +*> +*> On exit: +*> A is overwritten by rectangular K-by-N product H*A. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array A. LDA >= max(1,K). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,N) +*> +*> On entry: +*> a) In the M-by-(N-K) right block: input matrix B. +*> b) In the M-by-N left block: columns of V2. +*> +*> On exit: +*> B is overwritten by rectangular M-by-N product H*B. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,M). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, +*> dimension (LDWORK,max(K,N-K)) +*> \endverbatim +*> +*> \param[in] LDWORK +*> \verbatim +*> LDWORK is INTEGER +*> The leading dimension of the array WORK. LDWORK>=max(1,K). +*> +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16OTHERauxiliary +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> (1) Description of the Algebraic Operation. +*> +*> The matrix A is a K-by-N matrix composed of two column block +*> matrices, A1, which is K-by-K, and A2, which is K-by-(N-K): +*> A = ( A1, A2 ). +*> The matrix B is an M-by-N matrix composed of two column block +*> matrices, B1, which is M-by-K, and B2, which is M-by-(N-K): +*> B = ( B1, B2 ). +*> +*> Perform the operation: +*> +*> ( A_out ) := H * ( A_in ) = ( I - V * T * V**H ) * ( A_in ) = +*> ( B_out ) ( B_in ) ( B_in ) +*> = ( I - ( V1 ) * T * ( V1**H, V2**H ) ) * ( A_in ) +*> ( V2 ) ( B_in ) +*> On input: +*> +*> a) ( A_in ) consists of two block columns: +*> ( B_in ) +*> +*> ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in )) +*> ( B_in ) (( B1_in ) ( B2_in )) (( 0 ) ( B2_in )), +*> +*> where the column blocks are: +*> +*> ( A1_in ) is a K-by-K upper-triangular matrix stored in the +*> upper triangular part of the array A(1:K,1:K). +*> ( B1_in ) is an M-by-K rectangular ZERO matrix and not stored. +*> +*> ( A2_in ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_in ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). 
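+*>
+*>  For example, if M = 3, N = 5 and K = 2, then A1_in is the 2-by-2
+*>  upper-triangular block A(1:2,1:2), A2_in is the 2-by-3 block
+*>  A(1:2,3:5), B1_in is a 3-by-2 zero block that is not stored, and
+*>  B2_in is the 3-by-3 block B(1:3,3:5).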
+*> +*> b) V = ( V1 ) +*> ( V2 ) +*> +*> where: +*> 1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored; +*> 2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix, +*> stored in the lower-triangular part of the array +*> A(1:K,1:K) (ones are not stored), +*> and V2 is an M-by-K rectangular stored the array B(1:M,1:K), +*> (because on input B1_in is a rectangular zero +*> matrix that is not stored and the space is +*> used to store V2). +*> +*> c) T is a K-by-K upper-triangular matrix stored +*> in the array T(1:K,1:K). +*> +*> On output: +*> +*> a) ( A_out ) consists of two block columns: +*> ( B_out ) +*> +*> ( A_out ) = (( A1_out ) ( A2_out )) +*> ( B_out ) (( B1_out ) ( B2_out )), +*> +*> where the column blocks are: +*> +*> ( A1_out ) is a K-by-K square matrix, or a K-by-K +*> upper-triangular matrix, if V1 is an +*> identity matrix. AiOut is stored in +*> the array A(1:K,1:K). +*> ( B1_out ) is an M-by-K rectangular matrix stored +*> in the array B(1:M,K:N). +*> +*> ( A2_out ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_out ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> +*> The operation above can be represented as the same operation +*> on each block column: +*> +*> ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**H ) * ( A1_in ) +*> ( B1_out ) ( 0 ) ( 0 ) +*> +*> ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**H ) * ( A2_in ) +*> ( B2_out ) ( B2_in ) ( B2_in ) +*> +*> If IDENT != 'I': +*> +*> The computation for column block 1: +*> +*> A1_out: = A1_in - V1*T*(V1**H)*A1_in +*> +*> B1_out: = - V2*T*(V1**H)*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - V1*T*( (V1**H)*A2_in + (V2**H)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( (V1**H)*A2_in + (V2**H)*B2_in ) +*> +*> If IDENT == 'I': +*> +*> The operation for column block 1: +*> +*> A1_out: = A1_in - V1*T*A1_in +*> +*> B1_out: = - V2*T*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - T*( A2_in + (V2**H)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( A2_in + (V2**H)*B2_in ) +*> +*> (2) Description of the Algorithmic Computation. +*> +*> In the first step, we compute column block 2, i.e. A2 and B2. +*> Here, we need to use the K-by-(N-K) rectangular workspace +*> matrix W2 that is of the same size as the matrix A2. +*> W2 is stored in the array WORK(1:K,1:(N-K)). +*> +*> In the second step, we compute column block 1, i.e. A1 and B1. +*> Here, we need to use the K-by-K square workspace matrix W1 +*> that is of the same size as the as the matrix A1. +*> W1 is stored in the array WORK(1:K,1:K). +*> +*> NOTE: Hence, in this routine, we need the workspace array WORK +*> only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from +*> the first step and W1 from the second step. +*> +*> Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I', +*> more computations than in the Case (B). 
+*> +*> if( IDENT != 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(2) W2: = (V1**H) * W2 = (unit_lower_tr_of_(A1)**H) * W2 +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(2) W1: = (V1**H) * W1 = (unit_lower_tr_of_(A1)**H) * W1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6) square A1: = A1 - W1 +*> end if +*> end if +*> +*> Case (B), when V1 is an identity matrix, i.e. IDENT == 'I', +*> less computations than in the Case (A) +*> +*> if( IDENT == 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(6) upper-triangular_of_(A1): = A1 - W1 +*> end if +*> end if +*> +*> Combine these cases (A) and (B) together, this is the resulting +*> algorithm: +*> +*> if ( N > K ) then +*> +*> (First Step - column block 2) +*> +*> col2_(1) W2: = A2 +*> if( IDENT != 'I' ) then +*> col2_(2) W2: = (V1**H) * W2 +*> = (unit_lower_tr_of_(A1)**H) * W2 +*> end if +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2] +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> if( IDENT != 'I' ) then +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> end if +*> col2_(7) A2: = A2 - W2 +*> +*> else +*> +*> (Second Step - column block 1) +*> +*> col1_(1) W1: = A1 +*> if( IDENT != 'I' ) then +*> col1_(2) W1: = (V1**H) * W1 +*> = (unit_lower_tr_of_(A1)**H) * W1 +*> end if +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> if( IDENT != 'I' ) then +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1) +*> end if +*> col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1) +*> +*> end if +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, + $ WORK, LDWORK ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER IDENT + INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), T( LDT, * ), + $ WORK( LDWORK, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 CONE, CZERO + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), + $ CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LNOTIDENT + INTEGER I, J +* .. +* .. EXTERNAL FUNCTIONS .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL ZCOPY, ZGEMM, ZTRMM +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. 
K.GT.N ) + $ RETURN +* + LNOTIDENT = .NOT.LSAME( IDENT, 'I' ) +* +* ------------------------------------------------------------------ +* +* First Step. Computation of the Column Block 2: +* +* ( A2 ) := H * ( A2 ) +* ( B2 ) ( B2 ) +* +* ------------------------------------------------------------------ +* + IF( N.GT.K ) THEN +* +* col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N) +* into W2=WORK(1:K, 1:N-K) column-by-column. +* + DO J = 1, N-K + CALL ZCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 ) + END DO + + IF( LNOTIDENT ) THEN +* +* col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2, +* V1 is not an identy matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored). +* +* + CALL ZTRMM( 'L', 'L', 'C', 'U', K, N-K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(3) Compute W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL ZGEMM( 'C', 'N', K, N-K, M, CONE, B, LDB, + $ B( 1, K+1 ), LDB, CONE, WORK, LDWORK ) + END IF +* +* col2_(4) Compute W2: = T * W2, +* T is upper-triangular. +* + CALL ZTRMM( 'L', 'U', 'N', 'N', K, N-K, CONE, T, LDT, + $ WORK, LDWORK ) +* +* col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2, +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL ZGEMM( 'N', 'N', M, N-K, K, -CONE, B, LDB, + $ WORK, LDWORK, CONE, B( 1, K+1 ), LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col2_(6) Compute W2: = V1 * W2 = A1 * W2, +* V1 is not an identity matrix, but unit lower-triangular, +* V1 stored in A1 (diagonal ones are not stored). +* + CALL ZTRMM( 'L', 'L', 'N', 'U', K, N-K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(7) Compute A2: = A2 - W2 = +* = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K), +* column-by-column. +* + DO J = 1, N-K + DO I = 1, K + A( I, K+J ) = A( I, K+J ) - WORK( I, J ) + END DO + END DO +* + END IF +* +* ------------------------------------------------------------------ +* +* Second Step. Computation of the Column Block 1: +* +* ( A1 ) := H * ( A1 ) +* ( B1 ) ( 0 ) +* +* ------------------------------------------------------------------ +* +* col1_(1) Compute W1: = A1. Copy the upper-triangular +* A1 = A(1:K, 1:K) into the upper-triangular +* W1 = WORK(1:K, 1:K) column-by-column. +* + DO J = 1, K + CALL ZCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 ) + END DO +* +* Set the subdiagonal elements of W1 to zero column-by-column. +* + DO J = 1, K - 1 + DO I = J + 1, K + WORK( I, J ) = CZERO + END DO + END DO +* + IF( LNOTIDENT ) THEN +* +* col1_(2) Compute W1: = (V1**H) * W1 = (A1**H) * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL ZTRMM( 'L', 'L', 'C', 'U', K, K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col1_(3) Compute W1: = T * W1, +* T is upper-triangular, +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL ZTRMM( 'L', 'U', 'N', 'N', K, K, CONE, T, LDT, + $ WORK, LDWORK ) +* +* col1_(4) Compute B1: = - V2 * W1 = - B1 * W1, +* V2 = B1, W1 is upper-triangular with zeroes below the diagonal. +* + IF( M.GT.0 ) THEN + CALL ZTRMM( 'R', 'U', 'N', 'N', M, K, -CONE, WORK, LDWORK, + $ B, LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col1_(5) Compute W1: = V1 * W1 = A1 * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular on input with zeroes below the diagonal, +* and square on output. 
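+*     The ZTRMM below therefore uses SIDE = 'L', UPLO = 'L',
+*     TRANSA = 'N' and DIAG = 'U', so only the strictly lower
+*     triangle of A is referenced and a unit diagonal is assumed.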
+* + CALL ZTRMM( 'L', 'L', 'N', 'U', K, K, CONE, A, LDA, + $ WORK, LDWORK ) +* +* col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K) +* column-by-column. A1 is upper-triangular on input. +* If IDENT, A1 is square on output, and W1 is square, +* if NOT IDENT, A1 is upper-triangular on output, +* W1 is upper-triangular. +* +* col1_(6)_a Compute elements of A1 below the diagonal. +* + DO J = 1, K - 1 + DO I = J + 1, K + A( I, J ) = - WORK( I, J ) + END DO + END DO +* + END IF +* +* col1_(6)_b Compute elements of A1 on and above the diagonal. +* + DO J = 1, K + DO I = 1, J + A( I, J ) = A( I, J ) - WORK( I, J ) + END DO + END DO +* + RETURN +* +* End of ZLARFB_GETT +* + END diff --git a/lapack-netlib/SRC/zlarrv.f b/lapack-netlib/SRC/zlarrv.f index 23976dbef..8d10e3c2e 100644 --- a/lapack-netlib/SRC/zlarrv.f +++ b/lapack-netlib/SRC/zlarrv.f @@ -351,7 +351,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/ztgsja.f b/lapack-netlib/SRC/ztgsja.f index 851f6504a..c80e33158 100644 --- a/lapack-netlib/SRC/ztgsja.f +++ b/lapack-netlib/SRC/ztgsja.f @@ -401,7 +401,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - DOUBLE PRECISION ZERO, ONE + DOUBLE PRECISION ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) COMPLEX*16 CZERO, CONE PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), @@ -424,7 +424,8 @@ $ ZLASET, ZROT * .. * .. Intrinsic Functions .. - INTRINSIC ABS, DBLE, DCONJG, MAX, MIN + INTRINSIC ABS, DBLE, DCONJG, MAX, MIN, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -610,9 +611,9 @@ * A1 = DBLE( A( K+I, N-L+I ) ) B1 = DBLE( B( I, N-L+I ) ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * IF( GAMMA.LT.ZERO ) THEN CALL ZDSCAL( L-I+1, -ONE, B( I, N-L+I ), LDB ) diff --git a/lapack-netlib/SRC/zungbr.f b/lapack-netlib/SRC/zungbr.f index 3cdb8127d..c1c35822c 100644 --- a/lapack-netlib/SRC/zungbr.f +++ b/lapack-netlib/SRC/zungbr.f @@ -222,8 +222,8 @@ CALL ZUNGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( M.GT.1 ) THEN - CALL ZUNGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL ZUNGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF ELSE @@ -231,8 +231,8 @@ CALL ZUNGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( N.GT.1 ) THEN - CALL ZUNGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL ZUNGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF END IF diff --git a/lapack-netlib/SRC/zungtsqr_row.f b/lapack-netlib/SRC/zungtsqr_row.f new file mode 100644 index 000000000..0d32ad6ce --- /dev/null +++ b/lapack-netlib/SRC/zungtsqr_row.f @@ -0,0 +1,380 @@ +*> \brief \b ZUNGTSQR_ROW +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZUNGTSQR_ROW + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZUNGTSQR_ROW generates an M-by-N complex matrix Q_out with +*> orthonormal columns from the output of ZLATSQR. 
These N orthonormal +*> columns are the first N columns of a product of complex unitary +*> matrices Q(k)_in of order M, which are returned by ZLATSQR in +*> a special format. +*> +*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ). +*> +*> The input matrices Q(k)_in are stored in row and column blocks in A. +*> See the documentation of ZLATSQR for more details on the format of +*> Q(k)_in, where each Q(k)_in is represented by block Householder +*> transformations. This routine calls an auxiliary routine ZLARFB_GETT, +*> where the computation is performed on each individual block. The +*> algorithm first sweeps NB-sized column blocks from the right to left +*> starting in the bottom row block and continues to the top row block +*> (hence _ROW in the routine name). This sweep is in reverse order of +*> the order in which ZLATSQR generates the output blocks. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB +*> \verbatim +*> MB is INTEGER +*> The row block size used by ZLATSQR to return +*> arrays A and T. MB > N. +*> (Note that if MB > M, then M is used instead of MB +*> as the row block size). +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by ZLATSQR to return +*> arrays A and T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by ZLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See ZLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX*16 array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See ZLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. 
+*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16OTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 CONE, CZERO + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), + $ CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + COMPLEX*16 DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL ZLARFB_GETT, ZLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZUNGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL ZLASET('U', M, N, CZERO, CONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. 
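+*
+*    Illustration added for clarity (not in the original source): with
+*    M=10, N=3, MB=5, the formulas below give MB2 = 2, ITMP = 2,
+*    IB_BOTTOM = 10 and NUM_ALL_ROW_BLOCKS = 4, i.e. a top row block
+*    of rows 1:5 followed by bottom row blocks starting at rows 6, 8
+*    and 10, which this loop processes from the bottom up.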
JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. +* + MB2 = MB - N + M_PLUS_ONE = M + 1 + ITMP = ( M - MB - 1 ) / MB2 + IB_BOTTOM = ITMP * MB2 + MB + 1 + NUM_ALL_ROW_BLOCKS = ITMP + 2 + JB_T = NUM_ALL_ROW_BLOCKS * N + 1 +* + DO IB = IB_BOTTOM, MB+1, -MB2 +* +* Determine the block size IMB for the current row block +* in the matrix A. +* + IMB = MIN( M_PLUS_ONE - IB, MB2 ) +* +* Determine the column index JB_T for the current column block +* in the matrix T. +* + JB_T = JB_T - N +* +* Apply column blocks of H in the row block from right to left. +* +* KB is the column index of the current column block reflector +* in the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + CALL ZLARFB_GETT( 'I', IMB, N-KB+1, KNB, + $ T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA, + $ A( IB, KB ), LDA, WORK, KNB ) +* + END DO +* + END DO +* + END IF +* +* (2) Top row block of A. +* NOTE: If MB>=M, then we have only one row block of A of size M +* and we work on the entire matrix A. +* + MB1 = MIN( MB, M ) +* +* Apply column blocks of H in the top row block from right to left. +* +* KB is the column index of the current block reflector in +* the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + IF( MB1-KB-KNB+1.EQ.0 ) THEN +* +* In SLARFB_GETT parameters, when M=0, then the matrix B +* does not exist, hence we need to pass a dummy array +* reference DUMMY(1,1) to B with LDDUMMY=1. 
+* + CALL ZLARFB_GETT( 'N', 0, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ DUMMY( 1, 1 ), 1, WORK, KNB ) + ELSE + CALL ZLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ A( KB+KNB, KB), LDA, WORK, KNB ) + + END IF +* + END DO +* + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN +* +* End of ZUNGTSQR_ROW +* + END diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt index 80e6b3232..b4e2223f7 100644 --- a/lapack-netlib/TESTING/CMakeLists.txt +++ b/lapack-netlib/TESTING/CMakeLists.txt @@ -174,7 +174,20 @@ if(PYTHONINTERP_FOUND) endif() - +if(WIN32) +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1 +"if (Test-Path $args[2]) { Remove-Item -Force $args[2] } \n" +"$ErrorActionPreference = \"Stop\"\n" +"Get-Content $args[1] | & \"$($args[0]).exe\" | Out-File $args[2]\n" +"If ((Get-Content $args[2] | %{$_ -match \"FATAL\"}) -contains $true) {\n" +"echo Error\n" +"exit 1\n" +"} else {\n" +"exit 0\n" +"}\n" +) +set(helper_prefix powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1") +else() # $1 exec, $2 input, $3 output_result FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "rm -f $3\n" @@ -187,51 +200,52 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "exit 0\n" "fi\n" ) - +set(helper_prefix sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh") +endif() add_test(NAME "REAL_LAPACK_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out" ) add_test(NAME "COMPLEX_LAPACK_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out" ) add_test(NAME "DOUBLE_PRECISION_LAPACK_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out" ) add_test(NAME "COMPLEX16_LAPACK_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out" ) add_test(NAME "SINGLE-DOUBLE_PRECISION_LAPACK_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" " ${CMAKE_CURRENT_BINARY_DIR}/dstest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" " ${CMAKE_CURRENT_BINARY_DIR}/dstest.out" ) # ======== COMPLEX-COMPLEX16 
LIN TESTS ======================== add_test(NAME "Testing_COMPLEX-COMPLEX16_LAPACK_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" " ${CMAKE_CURRENT_BINARY_DIR}/zctest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" " ${CMAKE_CURRENT_BINARY_DIR}/zctest.out" ) # ======== SINGLE RFP LIN TESTS ======================== add_test(NAME "Testing_REAL_LAPACK_RFP_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out" ) # ======== COMPLEX16 RFP LIN TESTS ======================== add_test(NAME "Testing_DOUBLE_PRECISION_LAPACK_RFP_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out" ) # ======== COMPLEX16 RFP LIN TESTS ======================== add_test(NAME "Testing_COMPLEX_LAPACK_RFP_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out" ) # ======== COMPLEX16 RFP LIN TESTS ======================== add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out" ) # # @@ -239,327 +253,327 @@ add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines" # add_test(NAME "SNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/snep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/snep.out" ) add_test(NAME "SSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssep.out" ) 
add_test(NAME "SSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/sse2.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/sse2.out" ) add_test(NAME "SSVD:_Testing_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssvd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssvd.out" ) add_test(NAME "SSEC:_Testing_REAL_Eigen_Condition_Routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" " ${CMAKE_CURRENT_BINARY_DIR}/sec.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" " ${CMAKE_CURRENT_BINARY_DIR}/sec.out" ) add_test(NAME "SSEV:_Testing_REAL_Nonsymmetric_Eigenvalue_Driver" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" " ${CMAKE_CURRENT_BINARY_DIR}/sed.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" " ${CMAKE_CURRENT_BINARY_DIR}/sed.out" ) add_test(NAME "SGG:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgg.out" ) add_test(NAME "SGD:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgd.out" ) add_test(NAME "SSB:_Testing_REAL_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssb.out" ) add_test(NAME "SSG:_Testing_REAL_Symmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssg.out" ) add_test(NAME "SGEBAL:_Testing_the_balancing_of_a_REAL_general_matrix" - COMMAND sh 
"${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbal.out" ) add_test(NAME "SGEBAK:_Testing_the_back_transformation_of_a_REAL_balanced_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbak.out" ) add_test(NAME "SGGBAL:_Testing_the_balancing_of_a_pair_of_REAL_general_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbal.out" ) add_test(NAME "SGGBAK:_Testing_the_back_transformation_of_a_pair_of_REAL_balanced_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbak.out" ) add_test(NAME "SBB:_Testing_banded_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbb.out" ) add_test(NAME "SGLM:_Testing_Generalized_Linear_Regression_Model_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/sglm.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/sglm.out" ) add_test(NAME "SGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgqr.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgqr.out" ) add_test(NAME "SGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out" ) add_test(NAME "SCSD:_Testing_CS_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" 
"${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/scsd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/scsd.out" ) add_test(NAME "SLSE:_Testing_Constrained_Linear_Least_Squares_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/slse.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/slse.out" ) # ======== COMPLEX EIG TESTS =========================== add_test(NAME "CNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/cnep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/cnep.out" ) add_test(NAME "CSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/csep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/csep.out" ) add_test(NAME "CSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/cse2.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/cse2.out" ) add_test(NAME "CSVD:_Testing_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/csvd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/csvd.out" ) add_test(NAME "CEC:_Testing_COMPLEX_Eigen_Condition_Routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" " ${CMAKE_CURRENT_BINARY_DIR}/cec.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" " ${CMAKE_CURRENT_BINARY_DIR}/cec.out" ) add_test(NAME "CES:_Testing_COMPLEX_Nonsymmetric_Schur_Form_Driver" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" " ${CMAKE_CURRENT_BINARY_DIR}/ced.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" " ${CMAKE_CURRENT_BINARY_DIR}/ced.out" ) add_test(NAME "CGG:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgg.out" ) add_test(NAME "CGD:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgd.out" ) add_test(NAME "CHB:_Testing_Hermitian_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" " ${CMAKE_CURRENT_BINARY_DIR}/csb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" " ${CMAKE_CURRENT_BINARY_DIR}/csb.out" ) add_test(NAME "CSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out" ) add_test(NAME "CGEBAL:_Testing_the_balancing_of_a_COMPLEX_general_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out" ) add_test(NAME "CGEBAK:_Testing_the_back_transformation_of_a_COMPLEX_balanced_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out" ) add_test(NAME "CGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out" ) add_test(NAME "CGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX_balanced_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out" ) add_test(NAME "CBB:_Testing_banded_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out" ) add_test(NAME "CGLM:_Testing_Generalized_Linear_Regression_Model_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out" ) add_test(NAME "CGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out" ) add_test(NAME "CGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out" ) add_test(NAME "CCSD:_Testing_CS_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out" ) add_test(NAME "CLSE:_Testing_Constrained_Linear_Least_Squares_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out" ) # ======== DOUBLE EIG TESTS =========================== add_test(NAME "DNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out" ) add_test(NAME "DSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out" ) add_test(NAME "DSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " 
${CMAKE_CURRENT_BINARY_DIR}/dse2.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out" ) add_test(NAME "DSVD:_Testing_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out" ) add_test(NAME "DEC:_Testing_DOUBLE_PRECISION_Eigen_Condition_Routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out" ) add_test(NAME "DEV:_Testing_DOUBLE_PRECISION_Nonsymmetric_Eigenvalue_Driver" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out" ) add_test(NAME "DGG:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out" ) add_test(NAME "DGD:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out" ) add_test(NAME "DSB:_Testing_DOUBLE_PRECISION_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out" ) add_test(NAME "DSG:_Testing_DOUBLE_PRECISION_Symmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out" ) add_test(NAME "DGEBAL:_Testing_the_balancing_of_a_DOUBLE_PRECISION_general_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " 
${CMAKE_CURRENT_BINARY_DIR}/dbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out" ) add_test(NAME "DGEBAK:_Testing_the_back_transformation_of_a_DOUBLE_PRECISION_balanced_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out" ) add_test(NAME "DGGBAL:_Testing_the_balancing_of_a_pair_of_DOUBLE_PRECISION_general_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out" ) add_test(NAME "DGGBAK:_Testing_the_back_transformation_of_a_pair_of_DOUBLE_PRECISION_balanced_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out" ) add_test(NAME "DBB:_Testing_banded_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out" ) add_test(NAME "DGLM:_Testing_Generalized_Linear_Regression_Model_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out" ) add_test(NAME "DGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out" ) add_test(NAME "DGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out" ) add_test(NAME "DCSD:_Testing_CS_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " 
${CMAKE_CURRENT_BINARY_DIR}/dcsd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out" ) add_test(NAME "DLSE:_Testing_Constrained_Linear_Least_Squares_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out" ) # ======== COMPLEX16 EIG TESTS =========================== add_test(NAME "ZNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out" ) add_test(NAME "ZSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out" ) add_test(NAME "ZSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out" ) add_test(NAME "ZSVD:_Testing_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out" ) add_test(NAME "ZEC:_Testing_COMPLEX16_Eigen_Condition_Routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out" ) add_test(NAME "ZES:_Testing_COMPLEX16_Nonsymmetric_Schur_Form_Driver" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out" ) add_test(NAME "ZGG:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out" + COMMAND 
${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out" ) add_test(NAME "ZGD:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out" ) add_test(NAME "ZHB:_Testing_Hermitian_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out" ) add_test(NAME "ZSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out" ) add_test(NAME "ZGEBAL:_Testing_the_balancing_of_a_COMPLEX16_general_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out" ) add_test(NAME "ZGEBAK:_Testing_the_back_transformation_of_a_COMPLEX16_balanced_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out" ) add_test(NAME "ZGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out" ) add_test(NAME "ZGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX16_balanced_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out" ) add_test(NAME "ZBB:_Testing_banded_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out" + COMMAND 
${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out" ) add_test(NAME "ZGLM:_Testing_Generalized_Linear_Regression_Model_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out" ) add_test(NAME "ZGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out" ) add_test(NAME "ZGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out" ) add_test(NAME "ZCSD:_Testing_CS_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out" ) add_test(NAME "Constrained_Linear_Least_Squares_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out" ) diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index e877b1422..10c25a446 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -25,7 +25,7 @@ set(AEIGTST set(SCIGTST slafts.f slahd2.f slasum.f slatb9.f sstech.f sstect.f ssvdch.f ssvdct.f ssxt1.f) -set(SEIGTST schkee.f +set(SEIGTST schkee.F sbdt01.f sbdt02.f sbdt03.f sbdt04.f sbdt05.f schkbb.f schkbd.f schkbk.f schkbl.f schkec.f schkgg.f schkgk.f schkgl.f schkhs.f schksb.f schkst.f schkst2stg.f schksb2stg.f @@ -42,7 +42,7 @@ set(SEIGTST schkee.f sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f sstt22.f ssyt21.f ssyt22.f) -set(CEIGTST cchkee.f +set(CEIGTST cchkee.F cbdt01.f cbdt02.f cbdt03.f cbdt05.f cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f cchkgg.f cchkgk.f cchkgl.f cchkhb.f cchkhs.f cchkst.f cchkst2stg.f cchkhb2stg.f @@ -62,7 +62,7 @@ set(CEIGTST cchkee.f set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f dsvdch.f dsvdct.f dsxt1.f) -set(DEIGTST dchkee.f +set(DEIGTST dchkee.F dbdt01.f dbdt02.f dbdt03.f dbdt04.f dbdt05.f dchkbb.f dchkbd.f dchkbk.f dchkbl.f dchkec.f dchkgg.f dchkgk.f dchkgl.f dchkhs.f dchksb.f dchkst.f dchkst2stg.f dchksb2stg.f @@ -79,7 +79,7 @@ 
set(DEIGTST dchkee.f dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f dstt22.f dsyt21.f dsyt22.f) -set(ZEIGTST zchkee.f +set(ZEIGTST zchkee.F zbdt01.f zbdt02.f zbdt03.f zbdt05.f zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f zchkgg.f zchkgk.f zchkgl.f zchkhb.f zchkhs.f zchkst.f zchkst2stg.f zchkhb2stg.f diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index b3efebcd0..a292e4496 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -157,11 +157,11 @@ cleanobj: cleanexe: rm -f xeigtst* -schkee.o: schkee.f +schkee.o: schkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -dchkee.o: dchkee.f +dchkee.o: dchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -cchkee.o: cchkee.f +cchkee.o: cchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -zchkee.o: zchkee.f +zchkee.o: zchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< diff --git a/lapack-netlib/TESTING/EIG/cbdt05.f b/lapack-netlib/TESTING/EIG/cbdt05.f index 5a08ccce3..4ed157431 100644 --- a/lapack-netlib/TESTING/EIG/cbdt05.f +++ b/lapack-netlib/TESTING/EIG/cbdt05.f @@ -158,9 +158,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX - REAL SASUM, SLAMCH, CLANGE - EXTERNAL LSAME, ISAMAX, SASUM, SLAMCH, CLANGE - REAL SCASUM + REAL SASUM, SCASUM, SLAMCH, CLANGE + EXTERNAL LSAME, ISAMAX, SASUM, SCASUM, SLAMCH, CLANGE * .. * .. External Subroutines .. EXTERNAL CGEMM diff --git a/lapack-netlib/TESTING/EIG/cchkee.f b/lapack-netlib/TESTING/EIG/cchkee.F similarity index 97% rename from lapack-netlib/TESTING/EIG/cchkee.f rename to lapack-netlib/TESTING/EIG/cchkee.F index f2a5f8d41..ef9f71ec9 100644 --- a/lapack-netlib/TESTING/EIG/cchkee.f +++ b/lapack-netlib/TESTING/EIG/cchkee.F @@ -1034,6 +1034,10 @@ * ===================================================================== PROGRAM CCHKEE * +#if defined(_OPENMP) + use omp_lib +#endif +* * -- LAPACK test routine (version 3.7.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- @@ -1072,6 +1076,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS, ONE_THREAD REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1084,12 +1089,16 @@ INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), $ ISHFTS( MAXIN ), IACC22( MAXIN ) REAL ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), - $ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX ) - COMPLEX A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ), - $ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ), + $ RESULT( 500 ) + COMPLEX DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ), $ X( 5*NMAX ) * .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S + COMPLEX, DIMENSION(:), ALLOCATABLE :: WORK + COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, C +* .. * .. External Functions .. LOGICAL LSAMEN REAL SECOND, SLAMCH @@ -1130,6 +1139,21 @@ DATA INTSTR / '0123456789' / DATA IOLDSD / 0, 0, 0, 1 / * .. +* .. Allocate memory dynamically .. 
+* + ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * A = 0.0 @@ -1846,8 +1870,17 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) CALL XLAENV( 1, 1 ) CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL CERRST( 'CST', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_MAX_THREADS() + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) +#endif + CALL CERRST( 'CST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF DO 290 I = 1, NPARMS CALL XLAENV( 1, NBVAL( I ) ) CALL XLAENV( 2, NBMIN( I ) ) @@ -2305,8 +2338,17 @@ MAXTYP = 15 NTYPES = MIN( MAXTYP, NTYPES ) CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - IF( TSTERR ) - $ CALL CERRST( 'CHB', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_MAX_THREADS() + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) +#endif + CALL CERRST( 'CHB', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF * CALL CCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, * $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), * $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, @@ -2436,7 +2478,14 @@ 380 CONTINUE WRITE( NOUT, FMT = 9994 ) S2 = SECOND( ) - WRITE( NOUT, FMT = 9993 )S2 - S1 + WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (S, STAT = AllocateStatus) + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) diff --git a/lapack-netlib/TESTING/EIG/cckcsd.f b/lapack-netlib/TESTING/EIG/cckcsd.f index 9783f0361..9524cb30b 100644 --- a/lapack-netlib/TESTING/EIG/cckcsd.f +++ b/lapack-netlib/TESTING/EIG/cckcsd.f @@ -228,7 +228,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAHDG, ALAREQ, ALASUM, CCSDTS, CLACSG, CLAROR, - $ CLASET + $ CLASET, CSROT * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN diff --git a/lapack-netlib/TESTING/EIG/dchkee.f b/lapack-netlib/TESTING/EIG/dchkee.F similarity index 98% rename from lapack-netlib/TESTING/EIG/dchkee.f rename to lapack-netlib/TESTING/EIG/dchkee.F index dc6f3205a..89b6958fe 100644 --- a/lapack-netlib/TESTING/EIG/dchkee.f +++ b/lapack-netlib/TESTING/EIG/dchkee.F @@ -1038,7 +1038,11 @@ *> \ingroup double_eig * * ===================================================================== - PROGRAM DCHKEE + PROGRAM DCHKEE +* +#if defined(_OPENMP) + use omp_lib +#endif * * -- LAPACK test routine (version 3.7.0) -- * -- LAPACK is a software package provided by Univ. 
of Tennessee, -- @@ -1078,6 +1082,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS, ONE_THREAD DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1089,10 +1094,13 @@ $ PVAL( MAXIN ) INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), $ ISHFTS( MAXIN ), IACC22( MAXIN ) - DOUBLE PRECISION A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ), - $ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ), - $ WORK( LWORK ), X( 5*NMAX ) + DOUBLE PRECISION D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ), + $ TAUB( NMAX ), X( 5*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: WORK + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, C * .. * .. External Functions .. LOGICAL LSAMEN @@ -1132,7 +1140,18 @@ * .. * .. Data statements .. DATA INTSTR / '0123456789' / - DATA IOLDSD / 0, 0, 0, 1 / + DATA IOLDSD / 0, 0, 0, 1 / +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. * @@ -1856,8 +1875,17 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) CALL XLAENV( 1, 1 ) CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL DERRST( 'DST', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_MAX_THREADS() + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) +#endif + CALL DERRST( 'DST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF DO 290 I = 1, NPARMS CALL XLAENV( 1, NBVAL( I ) ) CALL XLAENV( 2, NBMIN( I ) ) @@ -2436,7 +2464,12 @@ 380 CONTINUE WRITE( NOUT, FMT = 9994 ) S2 = DSECND( ) - WRITE( NOUT, FMT = 9993 )S2 - S1 + WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) diff --git a/lapack-netlib/TESTING/EIG/dckcsd.f b/lapack-netlib/TESTING/EIG/dckcsd.f index 50db6baa0..063a5ef5c 100644 --- a/lapack-netlib/TESTING/EIG/dckcsd.f +++ b/lapack-netlib/TESTING/EIG/dckcsd.f @@ -226,7 +226,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAHDG, ALAREQ, ALASUM, DCSDTS, DLACSG, DLAROR, - $ DLASET + $ DLASET, DROT * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN diff --git a/lapack-netlib/TESTING/EIG/schkee.f b/lapack-netlib/TESTING/EIG/schkee.F similarity index 98% rename from lapack-netlib/TESTING/EIG/schkee.f rename to lapack-netlib/TESTING/EIG/schkee.F index 3757e0655..b58433959 100644 --- a/lapack-netlib/TESTING/EIG/schkee.f +++ b/lapack-netlib/TESTING/EIG/schkee.F @@ -1040,6 +1040,10 @@ * ===================================================================== PROGRAM SCHKEE * +#if defined(_OPENMP) + use omp_lib +#endif +* * -- LAPACK test routine (version 3.7.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- @@ -1078,6 +1082,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS, ONE_THREAD REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1089,10 +1094,13 @@ $ PVAL( MAXIN ) INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), $ ISHFTS( MAXIN ), IACC22( MAXIN ) - REAL A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ), - $ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ), - $ WORK( LWORK ), X( 5*NMAX ) + REAL D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ), + $ TAUB( NMAX ), X( 5*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: WORK + REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, C * .. * .. External Functions .. LOGICAL LSAMEN @@ -1132,7 +1140,18 @@ * .. * .. Data statements .. DATA INTSTR / '0123456789' / - DATA IOLDSD / 0, 0, 0, 1 / + DATA IOLDSD / 0, 0, 0, 1 / +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. * @@ -1857,8 +1876,17 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) CALL XLAENV( 1, 1 ) CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL SERRST( 'SST', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_MAX_THREADS() + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) +#endif + CALL SERRST( 'SST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF DO 290 I = 1, NPARMS CALL XLAENV( 1, NBVAL( I ) ) CALL XLAENV( 2, NBMIN( I ) ) @@ -2440,6 +2468,11 @@ WRITE( NOUT, FMT = 9994 ) S2 = SECOND( ) WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) diff --git a/lapack-netlib/TESTING/EIG/sckcsd.f b/lapack-netlib/TESTING/EIG/sckcsd.f index 5a6e4a099..be91eed51 100644 --- a/lapack-netlib/TESTING/EIG/sckcsd.f +++ b/lapack-netlib/TESTING/EIG/sckcsd.f @@ -226,7 +226,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAHDG, ALAREQ, ALASUM, SCSDTS, SLACSG, SLAROR, - $ SLASET + $ SLASET, SROT * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN diff --git a/lapack-netlib/TESTING/EIG/zbdt05.f b/lapack-netlib/TESTING/EIG/zbdt05.f index bbf0208b7..f262351e4 100644 --- a/lapack-netlib/TESTING/EIG/zbdt05.f +++ b/lapack-netlib/TESTING/EIG/zbdt05.f @@ -158,9 +158,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER IDAMAX - DOUBLE PRECISION DASUM, DLAMCH, ZLANGE - EXTERNAL LSAME, IDAMAX, DASUM, DLAMCH, ZLANGE - DOUBLE PRECISION DZASUM + DOUBLE PRECISION DASUM, DZASUM, DLAMCH, ZLANGE + EXTERNAL LSAME, IDAMAX, DASUM, DZASUM, DLAMCH, ZLANGE * .. * .. External Subroutines .. 
EXTERNAL ZGEMM diff --git a/lapack-netlib/TESTING/EIG/zchkee.f b/lapack-netlib/TESTING/EIG/zchkee.F similarity index 97% rename from lapack-netlib/TESTING/EIG/zchkee.f rename to lapack-netlib/TESTING/EIG/zchkee.F index 6807ef7e4..fb418a43b 100644 --- a/lapack-netlib/TESTING/EIG/zchkee.f +++ b/lapack-netlib/TESTING/EIG/zchkee.F @@ -1034,6 +1034,10 @@ * ===================================================================== PROGRAM ZCHKEE * +#if defined(_OPENMP) + use omp_lib +#endif +* * -- LAPACK test routine (version 3.7.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- @@ -1072,6 +1076,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS, ONE_THREAD DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1084,12 +1089,16 @@ INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), $ ISHFTS( MAXIN ), IACC22( MAXIN ) DOUBLE PRECISION ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), - $ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX ) - COMPLEX*16 A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ), - $ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ), + $ RESULT( 500 ) + COMPLEX*16 DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ), $ X( 5*NMAX ) * .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK, S + COMPLEX*16, DIMENSION(:), ALLOCATABLE :: WORK + COMPLEX*16, DIMENSION(:,:), ALLOCATABLE :: A, B, C +* .. * .. External Functions .. LOGICAL LSAMEN DOUBLE PRECISION DLAMCH, DSECND @@ -1130,6 +1139,21 @@ DATA INTSTR / '0123456789' / DATA IOLDSD / 0, 0, 0, 1 / * .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. 
* A = 0.0 @@ -1846,8 +1870,17 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) CALL XLAENV( 1, 1 ) CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL ZERRST( 'ZST', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_MAX_THREADS() + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) +#endif + CALL ZERRST( 'ZST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF DO 290 I = 1, NPARMS CALL XLAENV( 1, NBVAL( I ) ) CALL XLAENV( 2, NBMIN( I ) ) @@ -2303,8 +2336,17 @@ MAXTYP = 15 NTYPES = MIN( MAXTYP, NTYPES ) CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - IF( TSTERR ) - $ CALL ZERRST( 'ZHB', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_MAX_THREADS() + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) +#endif + CALL ZERRST( 'ZHB', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF * CALL ZCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, * $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), * $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, @@ -2435,6 +2477,13 @@ WRITE( NOUT, FMT = 9994 ) S2 = DSECND( ) WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (S, STAT = AllocateStatus) + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) diff --git a/lapack-netlib/TESTING/EIG/zckcsd.f b/lapack-netlib/TESTING/EIG/zckcsd.f index f77b111a4..92760337c 100644 --- a/lapack-netlib/TESTING/EIG/zckcsd.f +++ b/lapack-netlib/TESTING/EIG/zckcsd.f @@ -228,7 +228,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAHDG, ALAREQ, ALASUM, ZCSDTS, ZLACSG, ZLAROR, - $ ZLASET + $ ZLASET, ZDROT * .. * .. Intrinsic Functions .. 
INTRINSIC ABS, MIN diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index 0d0bb5418..fc55b8a96 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -6,7 +6,7 @@ set(SCLNTST slaord.f) set(DZLNTST dlaord.f) -set(SLINTST schkaa.f +set(SLINTST schkaa.F schkeq.f schkgb.f schkge.f schkgt.f schklq.f schkpb.f schkpo.f schkps.f schkpp.f schkpt.f schkq3.f schkql.f schkqr.f schkrq.f @@ -40,7 +40,7 @@ set(SLINTST schkaa.f sgennd.f sqrt04.f sqrt05.f schkqrt.f serrqrt.f schkqrtp.f serrqrtp.f schklqt.f schklqtp.f schktsqr.f serrlqt.f serrlqtp.f serrtsqr.f stsqr01.f slqt04.f slqt05.f - schkorhr_col.f serrorhr_col.f sorhr_col01.f) + schkorhr_col.f serrorhr_col.f sorhr_col01.f sorhr_col02.f) if(USE_XBLAS) list(APPEND SLINTST sdrvgbx.f sdrvgex.f sdrvsyx.f sdrvpox.f @@ -51,7 +51,7 @@ else() serrvx.f serrge.f serrsy.f serrpo.f) endif() -set(CLINTST cchkaa.f +set(CLINTST cchkaa.F cchkeq.f cchkgb.f cchkge.f cchkgt.f cchkhe.f cchkhe_rook.f cchkhe_rk.f cchkhe_aa.f cchkhe_aa_2stage.f @@ -96,7 +96,7 @@ set(CLINTST cchkaa.f cqrt04.f cqrt05.f cchkqrt.f cerrqrt.f cchkqrtp.f cerrqrtp.f cchklqt.f cchklqtp.f cchktsqr.f cerrlqt.f cerrlqtp.f cerrtsqr.f ctsqr01.f clqt04.f clqt05.f - cchkunhr_col.f cerrunhr_col.f cunhr_col01.f) + cchkunhr_col.f cerrunhr_col.f cunhr_col01.f cunhr_col02.f) if(USE_XBLAS) list(APPEND CLINTST cdrvgbx.f cdrvgex.f cdrvhex.f cdrvsyx.f cdrvpox.f @@ -107,7 +107,7 @@ else() cerrvx.f cerrge.f cerrhe.f cerrsy.f cerrpo.f) endif() -set(DLINTST dchkaa.f +set(DLINTST dchkaa.F dchkeq.f dchkgb.f dchkge.f dchkgt.f dchklq.f dchkpb.f dchkpo.f dchkps.f dchkpp.f dchkpt.f dchkq3.f dchkql.f dchkqr.f dchkrq.f @@ -142,7 +142,7 @@ set(DLINTST dchkaa.f dqrt04.f dqrt05.f dchkqrt.f derrqrt.f dchkqrtp.f derrqrtp.f dchklq.f dchklqt.f dchklqtp.f dchktsqr.f derrlqt.f derrlqtp.f derrtsqr.f dtsqr01.f dlqt04.f dlqt05.f - dchkorhr_col.f derrorhr_col.f dorhr_col01.f) + dchkorhr_col.f derrorhr_col.f dorhr_col01.f dorhr_col02.f) if(USE_XBLAS) list(APPEND DLINTST ddrvgbx.f ddrvgex.f ddrvsyx.f ddrvpox.f @@ -153,7 +153,7 @@ else() derrvx.f derrge.f derrsy.f derrpo.f) endif() -set(ZLINTST zchkaa.f +set(ZLINTST zchkaa.F zchkeq.f zchkgb.f zchkge.f zchkgt.f zchkhe.f zchkhe_rook.f zchkhe_rk.f zchkhe_aa.f zchkhe_aa_2stage.f @@ -198,7 +198,7 @@ set(ZLINTST zchkaa.f zqrt04.f zqrt05.f zchkqrt.f zerrqrt.f zchkqrtp.f zerrqrtp.f zchklqt.f zchklqtp.f zchktsqr.f zerrlqt.f zerrlqtp.f zerrtsqr.f ztsqr01.f zlqt04.f zlqt05.f - zchkunhr_col.f zerrunhr_col.f zunhr_col01.f) + zchkunhr_col.f zerrunhr_col.f zunhr_col01.f zunhr_col02.f) if(USE_XBLAS) list(APPEND ZLINTST zdrvgbx.f zdrvgex.f zdrvhex.f zdrvsyx.f zdrvpox.f diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile index 6e790aa93..54b26455e 100644 --- a/lapack-netlib/TESTING/LIN/Makefile +++ b/lapack-netlib/TESTING/LIN/Makefile @@ -74,7 +74,7 @@ SLINTST = schkaa.o \ sgennd.o sqrt04.o sqrt05.o schkqrt.o serrqrt.o schkqrtp.o serrqrtp.o \ schklqt.o schklqtp.o schktsqr.o \ serrlqt.o serrlqtp.o serrtsqr.o stsqr01.o slqt04.o slqt05.o \ - schkorhr_col.o serrorhr_col.o sorhr_col01.o + schkorhr_col.o serrorhr_col.o sorhr_col01.o sorhr_col02.o ifdef USEXBLAS SLINTST += sdrvgbx.o sdrvgex.o sdrvsyx.o sdrvpox.o \ @@ -123,7 +123,7 @@ CLINTST = cchkaa.o \ cqrt04.o cqrt05.o cchkqrt.o cerrqrt.o cchkqrtp.o cerrqrtp.o \ cchklqt.o cchklqtp.o cchktsqr.o \ cerrlqt.o cerrlqtp.o cerrtsqr.o ctsqr01.o clqt04.o clqt05.o \ - cchkunhr_col.o cerrunhr_col.o cunhr_col01.o + cchkunhr_col.o cerrunhr_col.o 
cunhr_col01.o cunhr_col02.o ifdef USEXBLAS CLINTST += cdrvgbx.o cdrvgex.o cdrvhex.o cdrvsyx.o cdrvpox.o \ @@ -167,7 +167,7 @@ DLINTST = dchkaa.o \ dqrt04.o dqrt05.o dchkqrt.o derrqrt.o dchkqrtp.o derrqrtp.o \ dchklq.o dchklqt.o dchklqtp.o dchktsqr.o \ derrlqt.o derrlqtp.o derrtsqr.o dtsqr01.o dlqt04.o dlqt05.o \ - dchkorhr_col.o derrorhr_col.o dorhr_col01.o + dchkorhr_col.o derrorhr_col.o dorhr_col01.o dorhr_col02.o ifdef USEXBLAS DLINTST += ddrvgbx.o ddrvgex.o ddrvsyx.o ddrvpox.o \ @@ -215,7 +215,7 @@ ZLINTST = zchkaa.o \ zqrt04.o zqrt05.o zchkqrt.o zerrqrt.o zchkqrtp.o zerrqrtp.o \ zchklqt.o zchklqtp.o zchktsqr.o \ zerrlqt.o zerrlqtp.o zerrtsqr.o ztsqr01.o zlqt04.o zlqt05.o \ - zchkunhr_col.o zerrunhr_col.o zunhr_col01.o + zchkunhr_col.o zerrunhr_col.o zunhr_col01.o zunhr_col02.o ifdef USEXBLAS ZLINTST += zdrvgbx.o zdrvgex.o zdrvhex.o zdrvsyx.o zdrvpox.o \ @@ -317,13 +317,13 @@ cleanobj: cleanexe: rm -f xlintst* -schkaa.o: schkaa.f +schkaa.o: schkaa.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -dchkaa.o: dchkaa.f +dchkaa.o: dchkaa.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -cchkaa.o: cchkaa.f +cchkaa.o: cchkaa.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -zchkaa.o: zchkaa.f +zchkaa.o: zchkaa.F $(FC) $(FFLAGS_DRV) -c -o $@ $< .NOTPARALLEL: diff --git a/lapack-netlib/TESTING/LIN/cchkaa.f b/lapack-netlib/TESTING/LIN/cchkaa.F similarity index 97% rename from lapack-netlib/TESTING/LIN/cchkaa.f rename to lapack-netlib/TESTING/LIN/cchkaa.F index d36770be7..ec1534ed4 100644 --- a/lapack-netlib/TESTING/LIN/cchkaa.f +++ b/lapack-netlib/TESTING/LIN/cchkaa.F @@ -110,17 +110,14 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex_lin * * ===================================================================== PROGRAM CCHKAA * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2017 * * ===================================================================== * @@ -156,9 +153,13 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - REAL RWORK( 150*NMAX+2*MAXRHS ), S( 2*NMAX ) - COMPLEX A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), WORK( NMAX, NMAX+MAXRHS+10 ) + REAL S( 2*NMAX ) + COMPLEX E( NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: RWORK + COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. LOGICAL LSAME, LSAMEN @@ -194,6 +195,17 @@ * .. Data statements .. DATA THREQ / 2.0 / , INTSTR / '0123456789' / * .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK( NMAX, NMAX+MAXRHS+10 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. 
* S1 = SECOND( ) @@ -1196,6 +1208,11 @@ S2 = SECOND( ) WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9998 FORMAT( / ' End of tests' ) diff --git a/lapack-netlib/TESTING/LIN/cchktsqr.f b/lapack-netlib/TESTING/LIN/cchktsqr.f index 8288916db..62b6ce434 100644 --- a/lapack-netlib/TESTING/LIN/cchktsqr.f +++ b/lapack-netlib/TESTING/LIN/cchktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL CERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/lapack-netlib/TESTING/LIN/cchkunhr_col.f b/lapack-netlib/TESTING/LIN/cchkunhr_col.f index 00077ddd9..0d6a9063d 100644 --- a/lapack-netlib/TESTING/LIN/cchkunhr_col.f +++ b/lapack-netlib/TESTING/LIN/cchkunhr_col.f @@ -24,9 +24,12 @@ *> *> \verbatim *> -*> CCHKUNHR_COL tests CUNHR_COL using CLATSQR and CGEMQRT. Therefore, CLATSQR -*> (used in CGEQR) and CGEMQRT (used in CGEMQR) have to be tested -*> before this test. +*> CCHKUNHR_COL tests: +*> 1) CUNGTSQR and CUNHR_COL using CLATSQR, CGEMQRT, +*> 2) CUNGTSQR_ROW and CUNHR_COL inside CGETSQRHRT +*> (which calls CLATSQR, CUNGTSQR_ROW and CUNHR_COL) using CGEMQRT. +*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR) +*> have to be tested before this test. *> *> \endverbatim * @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex_lin * * ===================================================================== - SUBROUTINE CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,10 +135,11 @@ REAL RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, CERRUNHR_COL, CUNHR_COL01 + EXTERNAL ALAHD, ALASUM, CERRUNHR_COL, CUNHR_COL01, + $ CUNHR_COL02 * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX, MIN * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -201,8 +202,8 @@ * * Test CUNHR_COL * - CALL CUNHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL CUNHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test CUNHR_COL +* + CALL CUNHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. * CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'CUNGTSQR and CUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'CUNGTSQR_ROW and CUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of CCHKUNHR_COL diff --git a/lapack-netlib/TESTING/LIN/cdrvgex.f b/lapack-netlib/TESTING/LIN/cdrvgex.f index 51fc84899..9b075908f 100644 --- a/lapack-netlib/TESTING/LIN/cdrvgex.f +++ b/lapack-netlib/TESTING/LIN/cdrvgex.f @@ -707,9 +707,10 @@ CALL CLACPY( 'Full', N, NRHS, BSAV, LDA, B, LDA ) IF( .NOT.PREFAC ) - $ CALL CLASET( 'Full', N, N, ZERO, ZERO, AFAC, - $ LDA ) - CALL CLASET( 'Full', N, NRHS, ZERO, ZERO, X, LDA ) + $ CALL CLASET( 'Full', N, N, CMPLX( ZERO ), + $ CMPLX( ZERO ), AFAC, LDA ) + CALL CLASET( 'Full', N, NRHS, CMPLX( ZERO ), + $ CMPLX( ZERO ), X, LDA ) IF( IEQUED.GT.1 .AND. N.GT.0 ) THEN * * Equilibrate the matrix if FACT = 'F' and diff --git a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f index 32be41f64..959258e1f 100644 --- a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f @@ -449,11 +449,11 @@ * Reconstruct matrix from factors and compute * residual. * -c CALL CHET01_AA( UPLO, N, A, LDA, AFAC, LDA, -c $ IWORK, AINV, LDA, RWORK, -c $ RESULT( 2 ) ) -c NT = 2 - NT = 1 +c CALL CHET01_AA( UPLO, N, A, LDA, AFAC, LDA, +c $ IWORK, AINV, LDA, RWORK, +c $ RESULT( 2 ) ) +c NT = 2 + NT = 1 * * Print information about the tests that did not pass * the threshold. diff --git a/lapack-netlib/TESTING/LIN/cdrvrfp.f b/lapack-netlib/TESTING/LIN/cdrvrfp.f index a57688f83..362a0e7cb 100644 --- a/lapack-netlib/TESTING/LIN/cdrvrfp.f +++ b/lapack-netlib/TESTING/LIN/cdrvrfp.f @@ -449,19 +449,19 @@ * Form the inverse of A. * CALL CPOTRI( UPLO, N, A, LDA, INFO ) + + IF ( N .NE. 0 ) THEN * -* Compute the 1-norm condition number of A. +* Compute the 1-norm condition number of A. * - IF ( N .NE. 0 ) THEN AINVNM = CLANHE( '1', UPLO, N, A, LDA, + S_WORK_CLANHE ) RCONDC = ( ONE / ANORM ) / AINVNM * * Restore the matrix A. * - CALL CLACPY( UPLO, N, N, ASAV, LDA, A, LDA ) + CALL CLACPY( UPLO, N, N, ASAV, LDA, A, LDA ) END IF - * END IF * diff --git a/lapack-netlib/TESTING/LIN/cunhr_col01.f b/lapack-netlib/TESTING/LIN/cunhr_col01.f index d760caba5..d77d60b1a 100644 --- a/lapack-netlib/TESTING/LIN/cunhr_col01.f +++ b/lapack-netlib/TESTING/LIN/cunhr_col01.f @@ -13,7 +13,7 @@ * .. Scalar Arguments .. * INTEGER M, N, MB1, NB1, NB2 * .. Return values .. -* REAL RESULT(6) +* DOUBLE PRECISION RESULT(6) * * *> \par Purpose: @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> CUNHR_COL01 tests CUNHR_COL using CLATSQR, CGEMQRT and CUNGTSQR. -*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part CGEMQR), CUNGTSQR +*> CUNHR_COL01 tests CUNGTSQR and CUNHR_COL using CLATSQR, CGEMQRT. +*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is REAL array, dimension (6) *> Results of each of the six tests below. 
-*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in CGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using CGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using CGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* -*> \ingroup complex16_lin +*> \ingroup complex_lin * * ===================================================================== SUBROUTINE CUNHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 @@ -102,10 +131,10 @@ * * .. * .. Local allocatable arrays - COMPLEX, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + COMPLEX , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), $ C(:,:), CF(:,:), D(:,:), DF(:,:) - REAL, ALLOCATABLE :: RWORK(:) + REAL , ALLOCATABLE :: RWORK(:) * * .. Parameters .. REAL ZERO @@ -218,7 +247,7 @@ * Copy the factor R into the array R. * SRNAMT = 'CLACPY' - CALL CLACPY( 'U', M, N, AF, M, R, M ) + CALL CLACPY( 'U', N, N, AF, M, R, M ) * * Reconstruct the orthogonal matrix Q. * @@ -240,7 +269,7 @@ * matrix S. * SRNAMT = 'CLACPY' - CALL CLACPY( 'U', M, N, R, M, AF, M ) + CALL CLACPY( 'U', N, N, R, M, AF, M ) * DO I = 1, N IF( DIAG( I ).EQ.-CONE ) THEN diff --git a/lapack-netlib/TESTING/LIN/cunhr_col02.f b/lapack-netlib/TESTING/LIN/cunhr_col02.f new file mode 100644 index 000000000..001f291da --- /dev/null +++ b/lapack-netlib/TESTING/LIN/cunhr_col02.f @@ -0,0 +1,381 @@ +*> \brief \b CUNHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. 
+* REAL RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CUNHR_COL02 tests CUNGTSQR_ROW and CUNHR_COL inside CGETSQRHRT +*> (which calls CLATSQR, CUNGTSQR_ROW and CUNHR_COL) using CGEMQRT. +*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is REAL array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in CGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using CGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using CGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex_lin +* +* ===================================================================== + SUBROUTINE CUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + REAL RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + COMPLEX , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) + REAL , ALLOCATABLE :: RWORK(:) +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) + COMPLEX CONE, CZERO + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), + $ CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + REAL ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + COMPLEX WORKQUERY( 1 ) +* .. +* .. External Functions .. 
+ REAL SLAMCH, CLANGE, CLANSY + EXTERNAL SLAMCH, CLANGE, CLANSY +* .. +* .. External Subroutines .. + EXTERNAL CLACPY, CLARNV, CLASET, CGETSQRHRT, + $ CSCAL, CGEMM, CGEMQRT, CHERK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, REAL, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = SLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL CLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL CLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL CLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in CLATSQR +* + NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* CGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* +* + CALL CGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In CGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'CGETSQRHRT' + CALL CGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. 
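*
*     The two CGETSQRHRT calls above follow the usual LAPACK workspace
*     query idiom: the first call passes LWORK = -1, performs no
*     factorization and returns the optimal workspace size in
*     WORKQUERY( 1 ); the test then allocates WORK and calls again
*     with the real LWORK.  A minimal stand-alone sketch of the idiom,
*     with illustration sizes only (assumes a LAPACK build providing
*     CGETSQRHRT is linked in):
*
      PROGRAM QSKETCH
      IMPLICIT NONE
      INTEGER            M, N, MB1, NB1, NB2
      PARAMETER          ( M = 20, N = 4, MB1 = 8, NB1 = 2, NB2 = 2 )
      INTEGER            I, J, LWORK, INFO
      COMPLEX            AF( M, N ), T2( NB2, N ), WORKQUERY( 1 )
      COMPLEX, ALLOCATABLE :: WORK( : )
*
*     Fill AF with a simple full-column-rank matrix.
*
      DO J = 1, N
         DO I = 1, M
            AF( I, J ) = ( 1.0E-1, 0.0E+0 )
         END DO
         AF( J, J ) = ( 2.0E+0, 0.0E+0 )
      END DO
*
*     Workspace query: LWORK = -1, size returned in WORKQUERY( 1 ).
*
      CALL CGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2,
     $                 WORKQUERY, -1, INFO )
      LWORK = INT( WORKQUERY( 1 ) )
      ALLOCATE ( WORK( LWORK ) )
*
*     Blocked QR with Householder reconstruction; AF and T2 can then
*     be passed to CGEMQRT, as in the test above.
*
      CALL CGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2,
     $                 WORK, LWORK, INFO )
      WRITE( *, * ) 'CGETSQRHRT INFO =', INFO
      DEALLOCATE ( WORK )
      END
*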
+* +* +* Generate the m-by-m matrix Q +* + CALL CLASET( 'Full', M, M, CZERO, CONE, Q, M ) +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL CLASET( 'Full', M, N, CZERO, CZERO, R, M ) +* + CALL CLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL CGEMM( 'C', 'N', M, N, M, -CONE, Q, M, A, M, CONE, R, M ) +* + ANORM = CLANGE( '1', M, N, A, M, RWORK ) + RESID = CLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL CLASET( 'Full', M, M, CZERO, CONE, R, M ) + CALL CHERK( 'U', 'C', M, M, -CONE, Q, M, CONE, R, M ) + RESID = CLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL CLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = CLANGE( '1', M, N, C, M, RWORK ) + CALL CLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL CGEMM( 'N', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = CLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL CLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'C', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL CGEMM( 'C', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = CLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL CLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = CLANGE( '1', N, M, D, N, RWORK ) + CALL CLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL CGEMM( 'N', 'N', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = CLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL CLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'R', 'C', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL CGEMM( 'N', 'C', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = CLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of CUNHR_COL02 +* + END diff --git a/lapack-netlib/TESTING/LIN/dchkaa.f b/lapack-netlib/TESTING/LIN/dchkaa.F similarity index 96% rename from lapack-netlib/TESTING/LIN/dchkaa.f rename to lapack-netlib/TESTING/LIN/dchkaa.F index 
03575c4d1..ef9d7808c 100644 --- a/lapack-netlib/TESTING/LIN/dchkaa.f +++ b/lapack-netlib/TESTING/LIN/dchkaa.F @@ -106,17 +106,14 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup double_lin * * ===================================================================== PROGRAM DCHKAA * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* Novemebr 2019 * * ===================================================================== * @@ -152,9 +149,12 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - DOUBLE PRECISION A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), RWORK( 5*NMAX+2*MAXRHS ), - $ S( 2*NMAX ), WORK( NMAX, 3*NMAX+MAXRHS+30 ) + DOUBLE PRECISION E( NMAX ), S( 2*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. LOGICAL LSAME, LSAMEN @@ -188,6 +188,18 @@ * .. Data statements .. DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / * .. +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK( NMAX, 3*NMAX+MAXRHS+30 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* * .. Executable Statements .. * S1 = DSECND( ) @@ -677,7 +689,7 @@ * * SK: symmetric indefinite matrices, * with bounded Bunch-Kaufman (rook) pivoting algorithm, -* differnet matrix storage format than SR path version. +* different matrix storage format than SR path version. * NTYPES = 10 CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) @@ -1039,6 +1051,11 @@ S2 = DSECND( ) WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9998 FORMAT( / ' End of tests' ) diff --git a/lapack-netlib/TESTING/LIN/dchkorhr_col.f b/lapack-netlib/TESTING/LIN/dchkorhr_col.f index 3b3e421eb..0e2d44d8d 100644 --- a/lapack-netlib/TESTING/LIN/dchkorhr_col.f +++ b/lapack-netlib/TESTING/LIN/dchkorhr_col.f @@ -24,9 +24,12 @@ *> *> \verbatim *> -*> DCHKORHR_COL tests DORHR_COL using DLATSQR and DGEMQRT. Therefore, DLATSQR -*> (used in DGEQR) and DGEMQRT (used in DGEMQR) have to be tested -*> before this test. +*> DCHKORHR_COL tests: +*> 1) DORGTSQR and DORHR_COL using DLATSQR, DGEMQRT, +*> 2) DORGTSQR_ROW and DORHR_COL inside DGETSQRHRT +*> (which calls DLATSQR, DORGTSQR_ROW and DORHR_COL) using DGEMQRT. +*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR) +*> have to be tested before this test. *> *> \endverbatim * @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \date November 2019 -* *> \ingroup double_lin * * ===================================================================== - SUBROUTINE DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,10 +135,11 @@ DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, DERRORHR_COL, DORHR_COL01 + EXTERNAL ALAHD, ALASUM, DERRORHR_COL, DORHR_COL01, + $ DORHR_COL02 * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX, MIN * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -201,8 +202,8 @@ * * Test DORHR_COL * - CALL DORHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL DORHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test DORHR_COL +* + CALL DORHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. * CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'DORGTSQR and DORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'DORGTSQR_ROW and DORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of DCHKORHR_COL diff --git a/lapack-netlib/TESTING/LIN/dchktsqr.f b/lapack-netlib/TESTING/LIN/dchktsqr.f index c4b1f01bd..14119e6e5 100644 --- a/lapack-netlib/TESTING/LIN/dchktsqr.f +++ b/lapack-netlib/TESTING/LIN/dchktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL DERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/lapack-netlib/TESTING/LIN/ddrvrfp.f b/lapack-netlib/TESTING/LIN/ddrvrfp.f index d67cf6713..18ccbdfc4 100644 --- a/lapack-netlib/TESTING/LIN/ddrvrfp.f +++ b/lapack-netlib/TESTING/LIN/ddrvrfp.f @@ -443,8 +443,7 @@ * CALL DPOTRI( UPLO, N, A, LDA, INFO ) - IF ( N .NE. 0 ) THEN - + IF ( N .NE. 0 ) THEN * * Compute the 1-norm condition number of A. 
* diff --git a/lapack-netlib/TESTING/LIN/dorhr_col01.f b/lapack-netlib/TESTING/LIN/dorhr_col01.f index 3e48de37f..979255ca9 100644 --- a/lapack-netlib/TESTING/LIN/dorhr_col01.f +++ b/lapack-netlib/TESTING/LIN/dorhr_col01.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> DORHR_COL01 tests DORHR_COL using DLATSQR, DGEMQRT and DORGTSQR. -*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part DGEMQR), DORGTSQR +*> DORHR_COL01 tests DORGTSQR and DORHR_COL using DLATSQR, DGEMQRT. +*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is DOUBLE PRECISION array, dimension (6) *> Results of each of the six tests below. -*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using DGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using DGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* -*> \ingroup single_lin +*> \ingroup double_lin * * ===================================================================== SUBROUTINE DORHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 diff --git a/lapack-netlib/TESTING/LIN/dorhr_col02.f b/lapack-netlib/TESTING/LIN/dorhr_col02.f new file mode 100644 index 000000000..d4c438edb --- /dev/null +++ b/lapack-netlib/TESTING/LIN/dorhr_col02.f @@ -0,0 +1,377 @@ +*> \brief \b DORHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. 
+* DOUBLE PRECISION RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DORHR_COL02 tests DORGTSQR_ROW and DORHR_COL inside DGETSQRHRT +*> (which calls DLATSQR, DORGTSQR_ROW and DORHR_COL) using DGEMQRT. +*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is DOUBLE PRECISION array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using DGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using DGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup double_lin +* +* ===================================================================== + SUBROUTINE DORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + DOUBLE PRECISION RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + DOUBLE PRECISION, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + DOUBLE PRECISION ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + DOUBLE PRECISION WORKQUERY( 1 ) +* .. +* .. External Functions .. 
+ DOUBLE PRECISION DLAMCH, DLANGE, DLANSY + EXTERNAL DLAMCH, DLANGE, DLANSY +* .. +* .. External Subroutines .. + EXTERNAL DLACPY, DLARNV, DLASET, DGETSQRHRT, + $ DSCAL, DGEMM, DGEMQRT, DSYRK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = DLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL DLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL DLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL DLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in DLATSQR +* + NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* DGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* +* + CALL DGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In DGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'DGETSQRHRT' + CALL DGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. 
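*
*     NRB computed above is the number of row blocks DLATSQR uses for
*     an M-by-N matrix with row block size MB1; it sizes the T1 array
*     allocated as T1( NB1, N * NRB ) above.  A quick stand-alone
*     sketch of the formula with illustration values only (not taken
*     from the test inputs): M = 20, N = 4, MB1 = 8 gives
*     CEILING( 16 / 4 ) = 4, so NRB = 4.
*
      PROGRAM NRBDEMO
      IMPLICIT NONE
      INTEGER            M, N, MB1, NRB
      M   = 20
      N   = 4
      MB1 = 8
*     Same formula as in DORHR_COL02 above; MB1 > N is assumed.
      NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) )
      WRITE( *, * ) 'Number of row blocks NRB =', NRB
      END
*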
+* +* +* Generate the m-by-m matrix Q +* + CALL DLASET( 'Full', M, M, ZERO, ONE, Q, M ) +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL DLASET( 'Full', M, N, ZERO, ZERO, R, M ) +* + CALL DLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL DGEMM( 'T', 'N', M, N, M, -ONE, Q, M, A, M, ONE, R, M ) +* + ANORM = DLANGE( '1', M, N, A, M, RWORK ) + RESID = DLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL DLASET( 'Full', M, M, ZERO, ONE, R, M ) + CALL DSYRK( 'U', 'T', M, M, -ONE, Q, M, ONE, R, M ) + RESID = DLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL DLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = DLANGE( '1', M, N, C, M, RWORK ) + CALL DLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL DGEMM( 'N', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = DLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL DLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'L', 'T', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL DGEMM( 'T', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = DLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL DLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = DLANGE( '1', N, M, D, N, RWORK ) + CALL DLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL DGEMM( 'N', 'N', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = DLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL DLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'R', 'T', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL DGEMM( 'N', 'T', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = DLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of DORHR_COL02 +* + END diff --git a/lapack-netlib/TESTING/LIN/schkaa.f b/lapack-netlib/TESTING/LIN/schkaa.F similarity index 97% rename from lapack-netlib/TESTING/LIN/schkaa.f rename to lapack-netlib/TESTING/LIN/schkaa.F index a9c13e442..a5b826d06 
100644 --- a/lapack-netlib/TESTING/LIN/schkaa.f +++ b/lapack-netlib/TESTING/LIN/schkaa.F @@ -104,17 +104,14 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup single_lin * * ===================================================================== PROGRAM SCHKAA * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * ===================================================================== * @@ -150,9 +147,12 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - REAL A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), RWORK( 5*NMAX+2*MAXRHS ), - $ S( 2*NMAX ), WORK( NMAX, NMAX+MAXRHS+30 ) + REAL E( NMAX ), S( 2*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: RWORK + REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. LOGICAL LSAME, LSAMEN @@ -186,6 +186,17 @@ * .. Data statements .. DATA THREQ / 2.0E0 / , INTSTR / '0123456789' / * .. +* .. Allocate memory dynamically .. +* + ALLOCATE (A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (WORK( NMAX, NMAX+MAXRHS+30 ) , STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * S1 = SECOND( ) @@ -1034,6 +1045,11 @@ S2 = SECOND( ) WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9998 FORMAT( / ' End of tests' ) diff --git a/lapack-netlib/TESTING/LIN/schkorhr_col.f b/lapack-netlib/TESTING/LIN/schkorhr_col.f index cf6d2d323..f61b74902 100644 --- a/lapack-netlib/TESTING/LIN/schkorhr_col.f +++ b/lapack-netlib/TESTING/LIN/schkorhr_col.f @@ -24,8 +24,11 @@ *> *> \verbatim *> -*> SCHKORHR_COL tests SORHR_COL using SLATSQR, SGEMQRT and SORGTSQR. -*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part SGEMQR), SORGTSQR +*> SCHKORHR_COL tests: +*> 1) SORGTSQR and SORHR_COL using SLATSQR, SGEMQRT, +*> 2) SORGTSQR_ROW and SORHR_COL inside DGETSQRHRT +*> (which calls SLATSQR, SORGTSQR_ROW and SORHR_COL) using SGEMQRT. +*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* -*> \ingroup sigle_lin +*> \ingroup single_lin * * ===================================================================== - SUBROUTINE SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- -* June 2019 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,7 +135,8 @@ REAL RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, SERRORHR_COL, SORHR_COL01 + EXTERNAL ALAHD, ALASUM, SERRORHR_COL, SORHR_COL01, + $ SORHR_COL02 * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -201,8 +202,8 @@ * * Test SORHR_COL * - CALL SORHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL SORHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test SORHR_COL +* + CALL SORHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. * CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'SORGTSQR and SORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'SORGTSQR_ROW and SORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of SCHKORHR_COL diff --git a/lapack-netlib/TESTING/LIN/schktsqr.f b/lapack-netlib/TESTING/LIN/schktsqr.f index 2bed434a8..aa4d6f9c4 100644 --- a/lapack-netlib/TESTING/LIN/schktsqr.f +++ b/lapack-netlib/TESTING/LIN/schktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL SERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/lapack-netlib/TESTING/LIN/sdrvrfp.f b/lapack-netlib/TESTING/LIN/sdrvrfp.f index 4b022bcfb..c0eb4d564 100644 --- a/lapack-netlib/TESTING/LIN/sdrvrfp.f +++ b/lapack-netlib/TESTING/LIN/sdrvrfp.f @@ -443,7 +443,7 @@ * CALL SPOTRI( UPLO, N, A, LDA, INFO ) - IF ( N .NE. 0 ) THEN + IF ( N .NE. 0 ) THEN * * Compute the 1-norm condition number of A. * diff --git a/lapack-netlib/TESTING/LIN/sorhr_col01.f b/lapack-netlib/TESTING/LIN/sorhr_col01.f index 02429041b..dcc2c1cae 100644 --- a/lapack-netlib/TESTING/LIN/sorhr_col01.f +++ b/lapack-netlib/TESTING/LIN/sorhr_col01.f @@ -8,12 +8,12 @@ * Definition: * =========== * -* SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT) +* SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT ) * * .. Scalar Arguments .. * INTEGER M, N, MB1, NB1, NB2 * .. Return values .. -* REAL RESULT(6) +* REAL RESULT(6) * * *> \par Purpose: @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> SORHR_COL01 tests SORHR_COL using SLATSQR, SGEMQRT and SORGTSQR. 
-*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part SGEMQR), SORGTSQR +*> SORHR_COL01 tests SORGTSQR and SORHR_COL using SLATSQR, SGEMQRT. +*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is REAL array, dimension (6) *> Results of each of the six tests below. -*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in SGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using SGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using SGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup single_lin * * ===================================================================== SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 @@ -102,7 +131,7 @@ * * .. * .. Local allocatable arrays - REAL, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + REAL , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), $ RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:), $ C(:,:), CF(:,:), D(:,:), DF(:,:) * @@ -128,7 +157,7 @@ $ SORGTSQR, SSCAL, SGEMM, SGEMQRT, SSYRK * .. * .. Intrinsic Functions .. - INTRINSIC CEILING, MAX, MIN, REAL + INTRINSIC CEILING, REAL, MAX, MIN * .. * .. Scalars in Common .. CHARACTER(LEN=32) SRNAMT @@ -230,7 +259,7 @@ * * Compute the factor R_hr corresponding to the Householder * reconstructed Q_hr and place it in the upper triangle of AF to -* match the Q storage format in DGEQRT. R_hr = R_tsqr * S, +* match the Q storage format in SGEQRT. R_hr = R_tsqr * S, * this means changing the sign of I-th row of the matrix R_tsqr * according to sign of of I-th diagonal element DIAG(I) of the * matrix S. 
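The rewritten RESULT(1)..RESULT(6) documentation above, and the new SORHR_COL02/ZUNHR_COL02 drivers introduced in the following hunks, all express their checks as scaled 1-norm residuals built from standard BLAS/LAPACK calls. As a point of reference, the sketch below is editorial and not taken from the patch: it shows only the orthogonality check (the TEST 2 pattern, | I - Q**T * Q | / ( eps * m ), evaluated with SSYRK and SLANSY), and it obtains an explicit orthogonal Q via SGEQRF/SORGQR purely for illustration, whereas the test files build Q through the SLATSQR / SGETSQRHRT / SGEMQRT path. It assumes a LAPACK/BLAS link line (e.g. -llapack -lblas); the program name and the fixed size M = 64 are illustrative choices only.

*     Minimal sketch: form an explicit orthogonal Q with SGEQRF/SORGQR,
*     then measure | I - Q**T * Q | / ( eps * m ) via SSYRK and SLANSY,
*     mirroring the TEST 2 residual used in the sorhr_col tests.
      PROGRAM ORTHCHK
      IMPLICIT NONE
      INTEGER           M, LWORK
      PARAMETER         ( M = 64, LWORK = 64*M )
      INTEGER           ISEED( 4 ), INFO
      REAL              A( M, M ), R( M, M ), TAU( M )
      REAL              WORK( LWORK ), RWORK( M )
      REAL              EPS, RESID
      REAL              SLAMCH, SLANSY
      EXTERNAL          SLAMCH, SLANSY
      DATA              ISEED / 1988, 1989, 1990, 1991 /
*
*     Random M-by-M matrix, QR factorization, explicit Q in A.
*
      CALL SLARNV( 2, ISEED, M*M, A )
      CALL SGEQRF( M, M, A, M, TAU, WORK, LWORK, INFO )
      CALL SORGQR( M, M, M, A, M, TAU, WORK, LWORK, INFO )
*
*     R := I - Q**T * Q (upper triangle), measured in the 1-norm.
*
      CALL SLASET( 'Full', M, M, 0.0E+0, 1.0E+0, R, M )
      CALL SSYRK( 'U', 'T', M, M, -1.0E+0, A, M, 1.0E+0, R, M )
      EPS = SLAMCH( 'Epsilon' )
      RESID = SLANSY( '1', 'Upper', M, R, M, RWORK )
      WRITE( *, * ) 'I - Q**T*Q residual =', RESID / ( EPS*M )
      END

The real test files additionally guard the denominator with MAX( 1, M ) so the scaling stays well defined for degenerate sizes; the sketch omits that only because M is fixed here.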
diff --git a/lapack-netlib/TESTING/LIN/sorhr_col02.f b/lapack-netlib/TESTING/LIN/sorhr_col02.f new file mode 100644 index 000000000..1cbe40577 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/sorhr_col02.f @@ -0,0 +1,376 @@ +*> \brief \b SORHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. +* REAL RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SORHR_COL02 tests SORGTSQR_ROW and SORHR_COL inside SGETSQRHRT +*> (which calls SLATSQR, SORGTSQR_ROW and SORHR_COL) using SGEMQRT. +*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is REAL array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in SGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using SGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using SGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup single_lin +* +* ===================================================================== + SUBROUTINE SORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + REAL RESULT(6) +* +* ===================================================================== +* +* .. +* .. 
Local allocatable arrays + REAL , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + REAL ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + REAL WORKQUERY( 1 ) +* .. +* .. External Functions .. + REAL SLAMCH, SLANGE, SLANSY + EXTERNAL SLAMCH, SLANGE, SLANSY +* .. +* .. External Subroutines .. + EXTERNAL SLACPY, SLARNV, SLASET, SGETSQRHRT, + $ SSCAL, SGEMM, SGEMQRT, SSYRK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, REAL, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = SLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL SLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL SLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL SLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in SLATSQR +* + NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* SGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* + CALL SGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In SGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'SGETSQRHRT' + CALL SGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. 
+* +* +* Generate the m-by-m matrix Q +* + CALL SLASET( 'Full', M, M, ZERO, ONE, Q, M ) +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL SLASET( 'Full', M, N, ZERO, ZERO, R, M ) +* + CALL SLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL SGEMM( 'T', 'N', M, N, M, -ONE, Q, M, A, M, ONE, R, M ) +* + ANORM = SLANGE( '1', M, N, A, M, RWORK ) + RESID = SLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL SLASET( 'Full', M, M, ZERO, ONE, R, M ) + CALL SSYRK( 'U', 'T', M, M, -ONE, Q, M, ONE, R, M ) + RESID = SLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL SLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = SLANGE( '1', M, N, C, M, RWORK ) + CALL SLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL SGEMM( 'N', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = SLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL SLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'L', 'T', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL SGEMM( 'T', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = SLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL SLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = SLANGE( '1', N, M, D, N, RWORK ) + CALL SLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL SGEMM( 'N', 'N', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = SLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL SLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'R', 'T', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL SGEMM( 'N', 'T', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = SLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of SORHR_COL02 +* + END diff --git a/lapack-netlib/TESTING/LIN/zchkaa.f b/lapack-netlib/TESTING/LIN/zchkaa.F similarity index 97% rename from lapack-netlib/TESTING/LIN/zchkaa.f rename to lapack-netlib/TESTING/LIN/zchkaa.F index 30d2a084a..a118515a5 
100644 --- a/lapack-netlib/TESTING/LIN/zchkaa.f +++ b/lapack-netlib/TESTING/LIN/zchkaa.F @@ -110,17 +110,14 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex16_lin * * ===================================================================== PROGRAM ZCHKAA * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * ===================================================================== * @@ -156,9 +153,13 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - DOUBLE PRECISION RWORK( 150*NMAX+2*MAXRHS ), S( 2*NMAX ) - COMPLEX*16 A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), WORK( NMAX, NMAX+MAXRHS+10 ) + DOUBLE PRECISION S( 2*NMAX ) + COMPLEX*16 E( NMAX ) +* +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE:: RWORK + COMPLEX*16, DIMENSION(:,:), ALLOCATABLE:: A, B, WORK * .. * .. External Functions .. LOGICAL LSAME, LSAMEN @@ -194,6 +195,16 @@ * .. * .. Data statements .. DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / +* +* .. Allocate memory dynamically .. + ALLOCATE (RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (A ((KDMAX+1) * NMAX, 7), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (B (NMAX * MAXRHS, 4), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (WORK (NMAX, NMAX+MAXRHS+10), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. * @@ -1231,6 +1242,11 @@ S2 = DSECND( ) WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9998 FORMAT( / ' End of tests' ) diff --git a/lapack-netlib/TESTING/LIN/zchktsqr.f b/lapack-netlib/TESTING/LIN/zchktsqr.f index e6e6ac556..678b1772f 100644 --- a/lapack-netlib/TESTING/LIN/zchktsqr.f +++ b/lapack-netlib/TESTING/LIN/zchktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL ZERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/lapack-netlib/TESTING/LIN/zchkunhr_col.f b/lapack-netlib/TESTING/LIN/zchkunhr_col.f index ef8f8bcc4..395ea178a 100644 --- a/lapack-netlib/TESTING/LIN/zchkunhr_col.f +++ b/lapack-netlib/TESTING/LIN/zchkunhr_col.f @@ -24,9 +24,12 @@ *> *> \verbatim *> -*> ZCHKUNHR_COL tests ZUNHR_COL using ZLATSQR and ZGEMQRT. Therefore, ZLATSQR -*> (used in ZGEQR) and ZGEMQRT (used in ZGEMQR) have to be tested -*> before this test. +*> ZCHKUNHR_COL tests: +*> 1) ZUNGTSQR and ZUNHR_COL using ZLATSQR, ZGEMQRT, +*> 2) ZUNGTSQR_ROW and ZUNHR_COL inside ZGETSQRHRT +*> (which calls ZLATSQR, ZUNGTSQR_ROW and ZUNHR_COL) using ZGEMQRT. +*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR) +*> have to be tested before this test. *> *> \endverbatim * @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \date November 2019 -* *> \ingroup complex16_lin * * ===================================================================== - SUBROUTINE ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,10 +135,11 @@ DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, ZERRUNHR_COL, ZUNHR_COL01 + EXTERNAL ALAHD, ALASUM, ZERRUNHR_COL, ZUNHR_COL01, + $ ZUNHR_COL02 * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX, MIN * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -201,8 +202,8 @@ * * Test ZUNHR_COL * - CALL ZUNHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL ZUNHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test ZUNHR_COL +* + CALL ZUNHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. * CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'ZUNGTSQR and ZUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'ZUNGTSQR_ROW and ZUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of ZCHKUNHR_COL diff --git a/lapack-netlib/TESTING/LIN/zdrvgex.f b/lapack-netlib/TESTING/LIN/zdrvgex.f index cdfa10727..1b784d31b 100644 --- a/lapack-netlib/TESTING/LIN/zdrvgex.f +++ b/lapack-netlib/TESTING/LIN/zdrvgex.f @@ -707,9 +707,10 @@ CALL ZLACPY( 'Full', N, NRHS, BSAV, LDA, B, LDA ) IF( .NOT.PREFAC ) - $ CALL ZLASET( 'Full', N, N, ZERO, ZERO, AFAC, - $ LDA ) - CALL ZLASET( 'Full', N, NRHS, ZERO, ZERO, X, LDA ) + $ CALL ZLASET( 'Full', N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) IF( IEQUED.GT.1 .AND. 
N.GT.0 ) THEN * * Equilibrate the matrix if FACT = 'F' and diff --git a/lapack-netlib/TESTING/LIN/zdrvhex.f b/lapack-netlib/TESTING/LIN/zdrvhex.f index 3c0dfbfe4..527114508 100644 --- a/lapack-netlib/TESTING/LIN/zdrvhex.f +++ b/lapack-netlib/TESTING/LIN/zdrvhex.f @@ -599,10 +599,10 @@ * Restore the matrices A and B. * IF( IFACT.EQ.2 ) - $ CALL ZLASET( UPLO, N, N, CMPLX( ZERO ), - $ CMPLX( ZERO ), AFAC, LDA ) - CALL ZLASET( 'Full', N, NRHS, CMPLX( ZERO ), - $ CMPLX( ZERO ), X, LDA ) + $ CALL ZLASET( UPLO, N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) * * Solve the system and compute the condition number * and error bounds using ZHESVXX. diff --git a/lapack-netlib/TESTING/LIN/zdrvpox.f b/lapack-netlib/TESTING/LIN/zdrvpox.f index 260d8c1f2..0bc2c89d8 100644 --- a/lapack-netlib/TESTING/LIN/zdrvpox.f +++ b/lapack-netlib/TESTING/LIN/zdrvpox.f @@ -611,10 +611,10 @@ CALL ZLACPY( 'Full', N, NRHS, BSAV, LDA, B, LDA ) IF( .NOT.PREFAC ) - $ CALL ZLASET( UPLO, N, N, CMPLX( ZERO ), - $ CMPLX( ZERO ), AFAC, LDA ) - CALL ZLASET( 'Full', N, NRHS, CMPLX( ZERO ), - $ CMPLX( ZERO ), X, LDA ) + $ CALL ZLASET( UPLO, N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) IF( IEQUED.GT.1 .AND. N.GT.0 ) THEN * * Equilibrate the matrix if FACT='F' and diff --git a/lapack-netlib/TESTING/LIN/zdrvrfp.f b/lapack-netlib/TESTING/LIN/zdrvrfp.f index c7be7da03..b299a487b 100644 --- a/lapack-netlib/TESTING/LIN/zdrvrfp.f +++ b/lapack-netlib/TESTING/LIN/zdrvrfp.f @@ -450,7 +450,7 @@ * CALL ZPOTRI( UPLO, N, A, LDA, INFO ) - IF ( N .NE. 0 ) THEN + IF ( N .NE. 0 ) THEN * * Compute the 1-norm condition number of A. * diff --git a/lapack-netlib/TESTING/LIN/zdrvsyx.f b/lapack-netlib/TESTING/LIN/zdrvsyx.f index 9431cd692..e4556f150 100644 --- a/lapack-netlib/TESTING/LIN/zdrvsyx.f +++ b/lapack-netlib/TESTING/LIN/zdrvsyx.f @@ -605,10 +605,10 @@ * Restore the matrices A and B. * IF( IFACT.EQ.2 ) - $ CALL ZLASET( UPLO, N, N, CMPLX( ZERO ), - $ CMPLX( ZERO ), AFAC, LDA ) - CALL ZLASET( 'Full', N, NRHS, CMPLX( ZERO ), - $ CMPLX( ZERO ), X, LDA ) + $ CALL ZLASET( UPLO, N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) * * Solve the system and compute the condition number * and error bounds using ZSYSVXX. diff --git a/lapack-netlib/TESTING/LIN/zerrvxx.f b/lapack-netlib/TESTING/LIN/zerrvxx.f index 9dc008215..bdaf44d8a 100644 --- a/lapack-netlib/TESTING/LIN/zerrvxx.f +++ b/lapack-netlib/TESTING/LIN/zerrvxx.f @@ -1166,7 +1166,7 @@ $ 2, RCOND, RPVGRW, BERR, N_ERR_BNDS, ERR_BNDS_N, $ ERR_BNDS_C, NPARAMS, PARAMS, W, RW, INFO ) CALL CHKXER( 'ZSYSVXX', INFOT, NOUT, LERR, OK ) - INFOT = 13 + INFOT = 13 EQ = 'N' CALL ZSYSVXX( 'N', 'U', 2, 0, A, 2, AF, 2, IP, EQ, R, B, 1, X, $ 2, RCOND, RPVGRW, BERR, N_ERR_BNDS, ERR_BNDS_N, diff --git a/lapack-netlib/TESTING/LIN/zunhr_col01.f b/lapack-netlib/TESTING/LIN/zunhr_col01.f index 9fb3bf352..b7590a8ea 100644 --- a/lapack-netlib/TESTING/LIN/zunhr_col01.f +++ b/lapack-netlib/TESTING/LIN/zunhr_col01.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> ZUNHR_COL01 tests ZUNHR_COL using ZLATSQR, ZGEMQRT and ZUNGTSQR. -*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part ZGEMQR), ZUNGTSQR +*> ZUNHR_COL01 tests ZUNGTSQR and ZUNHR_COL using ZLATSQR, ZGEMQRT. +*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR) *> have to be tested before this test. 
*> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is DOUBLE PRECISION array, dimension (6) *> Results of each of the six tests below. -*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using ZGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using ZGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex16_lin * * ===================================================================== SUBROUTINE ZUNHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 @@ -102,7 +131,7 @@ * * .. * .. Local allocatable arrays - COMPLEX*16, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + COMPLEX*16 , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), $ C(:,:), CF(:,:), D(:,:), DF(:,:) DOUBLE PRECISION, ALLOCATABLE :: RWORK(:) @@ -218,7 +247,7 @@ * Copy the factor R into the array R. * SRNAMT = 'ZLACPY' - CALL ZLACPY( 'U', M, N, AF, M, R, M ) + CALL ZLACPY( 'U', N, N, AF, M, R, M ) * * Reconstruct the orthogonal matrix Q. * @@ -240,7 +269,7 @@ * matrix S. * SRNAMT = 'ZLACPY' - CALL ZLACPY( 'U', M, N, R, M, AF, M ) + CALL ZLACPY( 'U', N, N, R, M, AF, M ) * DO I = 1, N IF( DIAG( I ).EQ.-CONE ) THEN diff --git a/lapack-netlib/TESTING/LIN/zunhr_col02.f b/lapack-netlib/TESTING/LIN/zunhr_col02.f new file mode 100644 index 000000000..c6e7f80cd --- /dev/null +++ b/lapack-netlib/TESTING/LIN/zunhr_col02.f @@ -0,0 +1,381 @@ +*> \brief \b ZUNHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. 
+* DOUBLE PRECISION RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZUNHR_COL02 tests ZUNGTSQR_ROW and ZUNHR_COL inside ZGETSQRHRT +*> (which calls ZLATSQR, ZUNGTSQR_ROW and ZUNHR_COL) using ZGEMQRT. +*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is DOUBLE PRECISION array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using ZGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using ZGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16_lin +* +* ===================================================================== + SUBROUTINE ZUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + DOUBLE PRECISION RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + COMPLEX*16 , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) + DOUBLE PRECISION, ALLOCATABLE :: RWORK(:) +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) + COMPLEX*16 CONE, CZERO + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), + $ CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + DOUBLE PRECISION ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. 
+ INTEGER ISEED( 4 ) + COMPLEX*16 WORKQUERY( 1 ) +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, ZLANGE, ZLANSY + EXTERNAL DLAMCH, ZLANGE, ZLANSY +* .. +* .. External Subroutines .. + EXTERNAL ZLACPY, ZLARNV, ZLASET, ZGETSQRHRT, + $ ZSCAL, ZGEMM, ZGEMQRT, ZHERK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = DLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL ZLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL ZLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL ZLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in ZLATSQR +* + NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* ZGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* +* + CALL ZGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In ZGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'ZGETSQRHRT' + CALL ZGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. 
+* +* +* Generate the m-by-m matrix Q +* + CALL ZLASET( 'Full', M, M, CZERO, CONE, Q, M ) +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL ZLASET( 'Full', M, N, CZERO, CZERO, R, M ) +* + CALL ZLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL ZGEMM( 'C', 'N', M, N, M, -CONE, Q, M, A, M, CONE, R, M ) +* + ANORM = ZLANGE( '1', M, N, A, M, RWORK ) + RESID = ZLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL ZLASET( 'Full', M, M, CZERO, CONE, R, M ) + CALL ZHERK( 'U', 'C', M, M, -CONE, Q, M, CONE, R, M ) + RESID = ZLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL ZLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = ZLANGE( '1', M, N, C, M, RWORK ) + CALL ZLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL ZGEMM( 'N', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = ZLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL ZLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'C', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL ZGEMM( 'C', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = ZLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL ZLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = ZLANGE( '1', N, M, D, N, RWORK ) + CALL ZLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL ZGEMM( 'N', 'N', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = ZLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL ZLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'R', 'C', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL ZGEMM( 'N', 'C', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = ZLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of ZUNHR_COL02 +* + END diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index e21ebd6c3..0b94e3aaa 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ 
b/lapack-netlib/TESTING/MATGEN/Makefile @@ -66,6 +66,7 @@ ZMATGEN = zlatms.o zlatme.o zlatmr.o zlatmt.o \ endif .PHONY: all +.NOTPARALLEL: all: $(TMGLIB) ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \ diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index fc410b0e7..fed5c1de5 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -662,7 +662,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0, - ipiv, 1, (void *)LASWP_PLUS, args -> nthreads); + ipiv, 1, (int (*)(void))LASWP_PLUS, args -> nthreads); is += bk; } diff --git a/lapack/laswp/generic/laswp_k_1.c b/lapack/laswp/generic/laswp_k_1.c index 88648cf29..556889291 100644 --- a/lapack/laswp/generic/laswp_k_1.c +++ b/lapack/laswp/generic/laswp_k_1.c @@ -57,10 +57,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; diff --git a/lapack/laswp/generic/laswp_k_2.c b/lapack/laswp/generic/laswp_k_2.c index 93b9a2c01..f76cd078f 100644 --- a/lapack/laswp/generic/laswp_k_2.c +++ b/lapack/laswp/generic/laswp_k_2.c @@ -59,10 +59,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; diff --git a/lapack/laswp/generic/laswp_k_4.c b/lapack/laswp/generic/laswp_k_4.c index 191a229a9..6520ed799 100644 --- a/lapack/laswp/generic/laswp_k_4.c +++ b/lapack/laswp/generic/laswp_k_4.c @@ -65,10 +65,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; diff --git a/lapack/laswp/generic/laswp_k_8.c b/lapack/laswp/generic/laswp_k_8.c index 947941839..a7bf06817 100644 --- a/lapack/laswp/generic/laswp_k_8.c +++ b/lapack/laswp/generic/laswp_k_8.c @@ -78,10 +78,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; diff --git a/lapack/laswp/generic/zlaswp_k_1.c b/lapack/laswp/generic/zlaswp_k_1.c index d1204778a..42aaed528 100644 --- a/lapack/laswp/generic/zlaswp_k_1.c +++ b/lapack/laswp/generic/zlaswp_k_1.c @@ -59,10 +59,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, lda *= 2; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; diff --git a/lapack/laswp/generic/zlaswp_k_2.c b/lapack/laswp/generic/zlaswp_k_2.c index c18ab4bee..1220870f8 100644 --- a/lapack/laswp/generic/zlaswp_k_2.c +++ b/lapack/laswp/generic/zlaswp_k_2.c @@ -60,10 +60,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, lda *= 2; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; diff --git a/lapack/laswp/generic/zlaswp_k_4.c b/lapack/laswp/generic/zlaswp_k_4.c index 45e1bf01e..cc7e296e1 100644 --- 
a/lapack/laswp/generic/zlaswp_k_4.c +++ b/lapack/laswp/generic/zlaswp_k_4.c @@ -69,10 +69,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, lda *= 2; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; diff --git a/lapack/laswp/loongarch64/Makefile b/lapack/laswp/loongarch64/Makefile new file mode 100644 index 000000000..71e5a87cb --- /dev/null +++ b/lapack/laswp/loongarch64/Makefile @@ -0,0 +1,12 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile diff --git a/lapack/lauum/lauum_L_parallel.c b/lapack/lauum/lauum_L_parallel.c index 0ebe3f069..1b32e4519 100644 --- a/lapack/lauum/lauum_L_parallel.c +++ b/lapack/lauum/lauum_L_parallel.c @@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.c = a; syrk_thread(mode | BLAS_TRANSA_T | BLAS_TRANSB_N | BLAS_UPLO, - &newarg, NULL, NULL, (void *)HERK_LC, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_LC, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = i; @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i ) * COMPSIZE; gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (void *)TRMM_LCLN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRMM_LCLN, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; diff --git a/lapack/lauum/lauum_U_parallel.c b/lapack/lauum/lauum_U_parallel.c index 7214c9731..f5ea54c88 100644 --- a/lapack/lauum/lauum_U_parallel.c +++ b/lapack/lauum/lauum_U_parallel.c @@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.c = a; syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (void *)HERK_UN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_UN, sa, sb, args -> nthreads); newarg.m = i; newarg.n = bk; @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + ( i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_TRANSA_T | BLAS_RSIDE, - &newarg, NULL, NULL, (void *)TRMM_RCUN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRMM_RCUN, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; diff --git a/lapack/potrf/potrf_L_parallel.c b/lapack/potrf/potrf_L_parallel.c index 68ec8e22a..986816d1a 100644 --- a/lapack/potrf/potrf_L_parallel.c +++ b/lapack/potrf/potrf_L_parallel.c @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i + bk + i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRSM_RCLN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; @@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_LN, sa, sb, args -> nthreads); #endif } } diff --git a/lapack/potrf/potrf_U_parallel.c b/lapack/potrf/potrf_U_parallel.c index 3b5d39511..cc6ff9912 100644 --- 
a/lapack/potrf/potrf_U_parallel.c +++ b/lapack/potrf/potrf_U_parallel.c @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRSM_LCUN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; @@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_UC, sa, sb, args -> nthreads); #endif } } diff --git a/openblas_config_template.h b/openblas_config_template.h index 858b8c5cb..6a7382108 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -99,5 +99,8 @@ typedef int blasint; /* Inclusion of Linux-specific header is needed for definition of cpu_set_t. */ #ifdef OPENBLAS_OS_LINUX +#ifndef _GNU_SOURCE + #define _GNU_SOURCE +#endif #include #endif diff --git a/param.h b/param.h index a0d45c573..8649e4486 100644 --- a/param.h +++ b/param.h @@ -72,6 +72,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef PARAM_H #define PARAM_H + #define SBGEMM_DEFAULT_UNROLL_N 4 #define SBGEMM_DEFAULT_UNROLL_M 8 #define SBGEMM_DEFAULT_UNROLL_MN 32 @@ -85,7 +86,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 256 -#define GEMM_DEFAULT_ALIGN 0x01ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x01ffffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -157,7 +158,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -237,7 +238,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL @@ -330,7 +331,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL @@ -422,7 +423,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL @@ -515,7 +516,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL @@ -607,7 +608,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -644,9 +645,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 - +/* #define SGEMM_DEFAULT_UNROLL_MN 32 #define DGEMM_DEFAULT_UNROLL_MN 32 +*/ #endif #ifdef ARCH_X86 @@ -725,7 +727,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 384 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -773,7 +775,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 256 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -820,7 +822,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 256 -#define GEMM_DEFAULT_ALIGN 0x01ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x01ffffUL #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -889,7 +891,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #ifdef HAVE_SSE #define SGEMM_DEFAULT_UNROLL_M 8 @@ -944,7 +946,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #ifdef CORE_YONAH #define SGEMM_DEFAULT_UNROLL_M 4 @@ -1010,7 +1012,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 32 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SYMV_P 8 @@ -1067,7 +1069,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 256 #endif -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SYMV_P 8 @@ -1127,7 +1129,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 448 #define GEMM_DEFAULT_OFFSET_B 128 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1200,7 +1202,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 128 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1271,7 +1273,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 128 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1343,7 +1345,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 32 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1416,7 +1418,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1509,7 +1511,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1552,9 +1554,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 - +/* #define SGEMM_DEFAULT_UNROLL_MN 32 #define DGEMM_DEFAULT_UNROLL_MN 32 +*/ #endif #ifdef ARCH_X86 @@ -1634,7 +1637,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1666,14 +1669,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define SGEMM_DEFAULT_UNROLL_M 16 +#ifdef DYNAMIC_ARCH +#define DGEMM_DEFAULT_UNROLL_M 4 +#else #define DGEMM_DEFAULT_UNROLL_M 16 +#endif #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_M 4 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 +#ifdef DYNAMIC_ARCH +#define DGEMM_DEFAULT_UNROLL_N 8 +#else #define DGEMM_DEFAULT_UNROLL_N 2 +#endif #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 @@ -1707,17 +1718,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define SGEMM_DEFAULT_P 448 +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_P 192 +#else +#define DGEMM_DEFAULT_P 384 +#endif #define CGEMM_DEFAULT_P 384 #define ZGEMM_DEFAULT_P 256 #define SGEMM_DEFAULT_Q 448 +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_Q 384 +#else +#define DGEMM_DEFAULT_Q 168 +#endif #define CGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 128 #define SGEMM_DEFAULT_R sgemm_r +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_R 8640 +#else +#define DGEMM_DEFAULT_R 13824 +#endif #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r @@ -1748,6 +1771,139 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef SAPPHIRERAPIDS + +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#define GEMM_PREFERED_SIZE 8 +#else +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 +#endif +#define USE_SGEMM_KERNEL_DIRECT 1 + +#undef SBGEMM_DEFAULT_UNROLL_N +#undef SBGEMM_DEFAULT_UNROLL_M +#undef SBGEMM_DEFAULT_P +#undef SBGEMM_DEFAULT_R +#undef SBGEMM_DEFAULT_Q +// FIXME: actually UNROLL_M = UNROLL_N = 16 +// If M and N is equal, OpenBLAS will reuse OCOPY as ICOPY. 
+// But for AMX, they are not the same, set UNROLL_M = 32 to workaround +#define SBGEMM_DEFAULT_UNROLL_N 16 +#define SBGEMM_DEFAULT_UNROLL_M 32 +#define SBGEMM_DEFAULT_P 256 +#define SBGEMM_DEFAULT_Q 1024 +#define SBGEMM_DEFAULT_R sbgemm_r + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 16 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_MN 32 +#define DGEMM_DEFAULT_UNROLL_MN 32 +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 640 +#define DGEMM_DEFAULT_P 192 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 320 +#define DGEMM_DEFAULT_Q 384 +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 8640 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 + +#define CGEMM3M_DEFAULT_P 320 +#define ZGEMM3M_DEFAULT_P 256 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 320 +#define ZGEMM3M_DEFAULT_Q 256 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + +#endif +#endif + #ifdef COOPERLAKE #define SNUMOPT 16 @@ -1768,6 +1924,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define USE_SGEMM_KERNEL_DIRECT 1 +#undef SBGEMM_DEFAULT_UNROLL_N +#undef SBGEMM_DEFAULT_UNROLL_M +#undef SBGEMM_DEFAULT_P +#undef SBGEMM_DEFAULT_R +#undef SBGEMM_DEFAULT_Q +#define SBGEMM_DEFAULT_UNROLL_N 4 +#define SBGEMM_DEFAULT_UNROLL_M 16 +#define SBGEMM_DEFAULT_P 384 +#define SBGEMM_DEFAULT_Q 768 +#define SBGEMM_DEFAULT_R sbgemm_r + #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 @@ -1875,7 +2042,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SYMV_P 8 @@ -1937,7 +2104,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 128 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -1991,7 +2158,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 512 #define GEMM_DEFAULT_OFFSET_B 512 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2059,7 +2226,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 8192 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2261,6 +2428,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_Q 216 #define DGEMM_DEFAULT_R 1012 +#define CGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_Q 104 +#define CGEMM_DEFAULT_R 1012 + #define ZGEMM_DEFAULT_P 256 #define ZGEMM_DEFAULT_Q 104 #define ZGEMM_DEFAULT_R 1012 @@ -2278,6 +2449,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_P 144 #define ZGEMM_DEFAULT_P 144 #endif + +#define SGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 256 #endif #if defined(POWER5) @@ -2342,6 +2518,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 + #define GEMM_DEFAULT_ALIGN 0x0ffffUL #if defined(__32BIT__) #warning using BINARY32==POWER6 @@ -2397,6 +2574,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 65536 #define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 + #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 16 @@ -2433,24 +2613,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 65536 #define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 + #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 +#if defined(HAVE_GAS) && (HAVE_GAS == 1) +#define DGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_N 4 +#else #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 8 +#endif #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 832 -#define DGEMM_DEFAULT_P 320 -#define CGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_P 384 +#define CGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_P 256 -#define SGEMM_DEFAULT_Q 1026 -#define DGEMM_DEFAULT_Q 960 -#define CGEMM_DEFAULT_Q 1026 -#define ZGEMM_DEFAULT_Q 1026 +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 512 +#define CGEMM_DEFAULT_Q 384 +#define ZGEMM_DEFAULT_Q 384 #define SGEMM_DEFAULT_R 4096 #define DGEMM_DEFAULT_R 4096 @@ -2541,7 +2729,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -2570,15 +2758,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef LOONGSON3A -/*Copy from SICORTEX*/ +#if defined(LOONGSON3R4) #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL + +#ifdef HAVE_MSA +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#else #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2590,6 +2790,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 +#endif #define SGEMM_DEFAULT_P 64 #define DGEMM_DEFAULT_P 44 @@ -2612,40 +2813,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef LOONGSON3B +#if defined(LOONGSON3R3) +////Copy from SICORTEX #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 2 -#define SGEMM_DEFAULT_UNROLL_N 2 +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 64 -#define DGEMM_DEFAULT_P 24 -#define CGEMM_DEFAULT_P 24 -#define ZGEMM_DEFAULT_P 20 +#define DGEMM_DEFAULT_P 44 +#define CGEMM_DEFAULT_P 64 +#define ZGEMM_DEFAULT_P 32 #define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 92 #define CGEMM_DEFAULT_Q 128 -#define ZGEMM_DEFAULT_Q 64 +#define ZGEMM_DEFAULT_Q 80 -#define SGEMM_DEFAULT_R 512 -#define DGEMM_DEFAULT_R 512 -#define CGEMM_DEFAULT_R 512 -#define ZGEMM_DEFAULT_R 512 +#define SGEMM_DEFAULT_R 640 +#define DGEMM_DEFAULT_R dgemm_r +#define CGEMM_DEFAULT_R 640 +#define ZGEMM_DEFAULT_R 640 #define GEMM_OFFSET_A1 0x10000 #define GEMM_OFFSET_B1 0x100000 @@ -2653,15 +2855,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 #endif +#if defined (LOONGSON3R5) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_M 16 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_P sgemm_p +#define DGEMM_DEFAULT_P 32 +#define QGEMM_DEFAULT_P qgemm_p +#define CGEMM_DEFAULT_P cgemm_p +#define ZGEMM_DEFAULT_P zgemm_p +#define XGEMM_DEFAULT_P xgemm_p + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 858 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 152 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 128 +#define XGEMM_DEFAULT_Q 128 + +#define SYMV_P 16 +#endif + #if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL -#ifdef HAVE_MSA +#if defined(HAVE_MSA) && !defined(NO_MSA) #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -2708,7 +2956,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef RISCV64_GENERIC #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -2789,7 +3037,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2830,7 +3078,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -2871,13 +3119,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 +#ifdef _WIN64 +/* Use explicit casting for win64 as LLP64 datamodel is used */ +#define GEMM_DEFAULT_ALIGN (BLASULONG)0x03fffUL +#else #define GEMM_DEFAULT_ALIGN 0x03fffUL +#endif #define SYMV_P 16 #if defined(CORTEXA57) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ - defined(FALKOR) || defined(TSV110) || defined(EMAG8180) + defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2894,7 +3147,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*FIXME: this should be using the cache size, but there is currently no easy way to query that on ARM. 
So if getarch counted more than 8 cores we simply assume the host is a big desktop or server with abundant cache rather than a phone or embedded device */ -#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) +#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) #define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 256 #define CGEMM_DEFAULT_P 256 @@ -2921,12 +3174,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 2048 -#elif defined(CORTEXA53) +#elif defined(CORTEXA53) || defined(CORTEXA55) #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 8 @@ -3066,7 +3319,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#else /* Other/undetected ARMv8 cores */ +#elif defined(NEOVERSEV1) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3080,6 +3333,105 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#elif defined(NEOVERSEN2) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#elif defined(ARMV8SVE) || defined(A64FX) + +/* When all BLAS3 routines are implemented with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". +Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated. */ +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 8 +/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N) + * Since we don't define SGEMM_UNROLL_M correctly we have to manually set this macro. + * If SVE size is ever more than 1024, this should be increased also. */ +#define SGEMM_DEFAULT_UNROLL_MN 32 + +/* When all BLAS3 routines are implemented with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl". +Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated.
*/ +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_MN 32 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 16 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#else /* Other/undetected ARMv8 cores */ + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 8 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 160 #define CGEMM_DEFAULT_P 128 @@ -3097,6 +3449,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #endif /* Cores */ + #endif /* ARMv8 */ #if defined(ARMV5) @@ -3105,7 +3458,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -3146,7 +3499,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3187,7 +3540,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3228,7 +3581,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -3267,7 +3620,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3349,7 +3702,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_N 2 @@ -3374,6 +3727,20 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define XGEMM_DEFAULT_UNROLL_M 1 #endif +#ifdef ARCH_MIPS +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 
120 +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 +#else #define SGEMM_DEFAULT_P sgemm_p #define DGEMM_DEFAULT_P dgemm_p #define QGEMM_DEFAULT_P qgemm_p @@ -3394,6 +3761,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 128 #define XGEMM_DEFAULT_Q 128 +#endif #define SYMV_P 16 diff --git a/relapack/config.h b/relapack/config.h index e4fab0a12..9d6919463 100644 --- a/relapack/config.h +++ b/relapack/config.h @@ -115,7 +115,7 @@ #define INCLUDE_CTGSYL INCLUDE_XTGSYL #define INCLUDE_ZTGSYL INCLUDE_XTGSYL -#define INCLUDE_XGEMMT 0 +#define INCLUDE_XGEMMT 1 #define INCLUDE_SGEMMT INCLUDE_XGEMMT #define INCLUDE_DGEMMT INCLUDE_XGEMMT #define INCLUDE_CGEMMT INCLUDE_XGEMMT diff --git a/relapack/src/lapack_wrappers.c b/relapack/src/lapack_wrappers.c index 0252f3d92..fc3dbc11e 100644 --- a/relapack/src/lapack_wrappers.c +++ b/relapack/src/lapack_wrappers.c @@ -566,7 +566,8 @@ void LAPACK(sgemmt)( const float *B, const blasint *ldB, const float *beta, float *C, const blasint *ldC ) { - RELAPACK_sgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_sgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info); } #endif @@ -578,7 +579,8 @@ void LAPACK(dgemmt)( const double *B, const blasint *ldB, const double *beta, double *C, const blasint *ldC ) { - RELAPACK_dgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_dgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info); } #endif @@ -590,7 +592,8 @@ void LAPACK(cgemmt)( const float *B, const blasint *ldB, const float *beta, float *C, const blasint *ldC ) { - RELAPACK_cgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_cgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info); } #endif @@ -602,6 +605,7 @@ void LAPACK(zgemmt)( const double *B, const blasint *ldB, const double *beta, double *C, const blasint *ldC ) { - RELAPACK_zgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_zgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info); } #endif diff --git a/relapack/src/lapack_wrappers.c.orig b/relapack/src/lapack_wrappers.c.orig deleted file mode 100644 index d89d2fe2f..000000000 --- a/relapack/src/lapack_wrappers.c.orig +++ /dev/null @@ -1,607 +0,0 @@ -#include "relapack.h" - -//////////// -// XLAUUM // -//////////// - -#if INCLUDE_SLAUUM -void LAPACK(slauum)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_slauum(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_DLAUUM -void LAPACK(dlauum)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_dlauum(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_CLAUUM -void LAPACK(clauum)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_clauum(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_ZLAUUM -void LAPACK(zlauum)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_zlauum(uplo, n, A, ldA, info); -} -#endif - - -//////////// -// XSYGST // -//////////// - -#if INCLUDE_SSYGST -void LAPACK(ssygst)( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info -) { - RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info); -} -#endif - -#if INCLUDE_DSYGST -void LAPACK(dsygst)( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const 
double *B, const int *ldB, - int *info -) { - RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info); -} -#endif - -#if INCLUDE_CSYGST -void LAPACK(csygst)( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info -) { - RELAPACK_csygst(itype, uplo, n, A, ldA, B, ldB, info); -} -#endif - -#if INCLUDE_ZSYGST -void LAPACK(zsygst)( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - int *info -) { - RELAPACK_zsygst(itype, uplo, n, A, ldA, B, ldB, info); -} -#endif - - -//////////// -// XTRTRI // -//////////// - -#if INCLUDE_STRTRI -void LAPACK(strtri)( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_strtri(uplo, diag, n, A, ldA, info); -} -#endif - -#if INCLUDE_DTRTRI -void LAPACK(dtrtri)( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_dtrtri(uplo, diag, n, A, ldA, info); -} -#endif - -#if INCLUDE_CTRTRI -void LAPACK(ctrtri)( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_ctrtri(uplo, diag, n, A, ldA, info); -} -#endif - -#if INCLUDE_ZTRTRI -void LAPACK(ztrtri)( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_ztrtri(uplo, diag, n, A, ldA, info); -} -#endif - - -//////////// -// XPOTRF // -//////////// - -#if INCLUDE_SPOTRF -void LAPACK(spotrf)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_spotrf(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_DPOTRF -void LAPACK(dpotrf)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_dpotrf(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_CPOTRF -void LAPACK(cpotrf)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_cpotrf(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_ZPOTRF -void LAPACK(zpotrf)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_zpotrf(uplo, n, A, ldA, info); -} -#endif - - -//////////// -// XPBTRF // -//////////// - -#if INCLUDE_SPBTRF -void LAPACK(spbtrf)( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info -) { - RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info); -} -#endif - -#if INCLUDE_DPBTRF -void LAPACK(dpbtrf)( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info -) { - RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info); -} -#endif - -#if INCLUDE_CPBTRF -void LAPACK(cpbtrf)( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info -) { - RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info); -} -#endif - -#if INCLUDE_ZPBTRF -void LAPACK(zpbtrf)( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info -) { - RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info); -} -#endif - - -//////////// -// XSYTRF // -//////////// - -#if INCLUDE_SSYTRF -void LAPACK(ssytrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_DSYTRF -void LAPACK(dsytrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} 
-#endif - -#if INCLUDE_CSYTRF -void LAPACK(csytrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_ZSYTRF -void LAPACK(zsytrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_CHETRF -void LAPACK(chetrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_ZHETRF -void LAPACK(zhetrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_SSYTRF_ROOK -void LAPACK(ssytrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_DSYTRF_ROOK -void LAPACK(dsytrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_CSYTRF_ROOK -void LAPACK(csytrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_ZSYTRF_ROOK -void LAPACK(zsytrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_CHETRF_ROOK -void LAPACK(chetrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_ZHETRF_ROOK -void LAPACK(zhetrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - - -//////////// -// XGETRF // -//////////// - -#if INCLUDE_SGETRF -void LAPACK(sgetrf)( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info -) { - RELAPACK_sgetrf(m, n, A, ldA, ipiv, info); -} -#endif - -#if INCLUDE_DGETRF -void LAPACK(dgetrf)( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info -) { - RELAPACK_dgetrf(m, n, A, ldA, ipiv, info); -} -#endif - -#if INCLUDE_CGETRF -void LAPACK(cgetrf)( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info -) { - RELAPACK_cgetrf(m, n, A, ldA, ipiv, info); -} -#endif - -#if INCLUDE_ZGETRF -void LAPACK(zgetrf)( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info -) { - RELAPACK_zgetrf(m, n, A, ldA, ipiv, info); -} -#endif - - -//////////// -// XGBTRF // -//////////// - -#if INCLUDE_SGBTRF -void LAPACK(sgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info -) { - RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); -} -#endif - -#if 
INCLUDE_DGBTRF -void LAPACK(dgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info -) { - RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); -} -#endif - -#if INCLUDE_CGBTRF -void LAPACK(cgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info -) { - RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); -} -#endif - -#if INCLUDE_ZGBTRF -void LAPACK(zgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info -) { - RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); -} -#endif - - -//////////// -// XTRSYL // -//////////// - -#if INCLUDE_STRSYL -void LAPACK(strsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info -) { - RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); -} -#endif - -#if INCLUDE_DTRSYL -void LAPACK(dtrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info -) { - RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); -} -#endif - -#if INCLUDE_CTRSYL -void LAPACK(ctrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info -) { - RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); -} -#endif - -#if INCLUDE_ZTRSYL -void LAPACK(ztrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info -) { - RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); -} -#endif - - -//////////// -// XTGSYL // -//////////// - -#if INCLUDE_STGSYL -void LAPACK(stgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, - float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info -) { - RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); -} -#endif - -#if INCLUDE_DTGSYL -void LAPACK(dtgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, - double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int *info -) { - RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); -} -#endif - -#if INCLUDE_CTGSYL -void LAPACK(ctgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, - float *scale, float *dif, - 
float *Work, const int *lWork, int *iWork, int *info -) { - RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); -} -#endif - -#if INCLUDE_ZTGSYL -void LAPACK(ztgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, - double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int *info -) { - RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); -} -#endif - - -//////////// -// XGEMMT // -//////////// - -#if INCLUDE_SGEMMT -void LAPACK(sgemmt)( - const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC -) { - RELAPACK_sgemmt(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_DGEMMT -void LAPACK(dgemmt)( - const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC -) { - RELAPACK_dgemmt(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_CGEMMT -void LAPACK(cgemmt)( - const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC -) { - RELAPACK_cgemmt(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_ZGEMMT -void LAPACK(zgemmt)( - const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC -) { - RELAPACK_zgemmt(uplo, n, A, ldA, info); -} -#endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 360ff2151..e4ee8b28b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,6 +2,10 @@ include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_BINARY_DIR}) enable_language(Fortran) +if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize") +endif() + if (BUILD_SINGLE) list( APPEND OpenBLAS_Tests sblat1 sblat2 sblat3) @@ -22,6 +26,20 @@ target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}) endforeach() # $1 exec, $2 input, $3 output_result +if(WIN32) +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1 +"if (Test-Path $args[2]) { Remove-Item -Force $args[2] } \n" +"$ErrorActionPreference = \"Stop\"\n" +"Get-Content $args[1] | & $args[0]\n" +"If ((Get-Content $args[2] | %{$_ -match \"FATAL\"}) -contains $true) {\n" +"echo Error\n" +"exit 1\n" +"} else {\n" +"exit 0\n" +"}\n" +) +set(helper_prefix powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1") +else() FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "rm -f $3\n" "$1 < $2\n" @@ -33,6 +51,8 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "exit 0\n" "fi\n" ) +set(helper_prefix sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh") +endif() #set(float_types s d c z) if (BUILD_SINGLE) @@ -50,9 +70,9 @@ endif() foreach(float_type ${float_types}) string(TOUPPER ${float_type} float_type_upper) add_test(NAME 
"${float_type}blas1" - COMMAND "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat1") + COMMAND $) add_test(NAME "${float_type}blas2" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat2" "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) + COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) add_test(NAME "${float_type}blas3" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat3" "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) + COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) endforeach() diff --git a/test/Makefile b/test/Makefile index 5f653414a..923f1537c 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,6 +1,8 @@ TOPDIR = .. include ../Makefile.system - +ifeq ($(F_COMPILER),GFORTRAN) + override FFLAGS += -fno-tree-vectorize +endif ifeq ($(NOFORTRAN),1) all :: @@ -259,10 +261,6 @@ endif FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) -ifeq ($(CORE), C910V) -EXTRALIB = -CEXTRALIB = -endif ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) @@ -270,6 +268,9 @@ ifeq ($(C_COMPILER), CLANG) CEXTRALIB = -lomp endif endif +ifeq ($(F_COMPILER), NAG) +CEXTRALIB = -lgomp +endif endif ifeq ($(BUILD_SINGLE),1) diff --git a/utest/ctest.h b/utest/ctest.h index d316b1494..79961badf 100644 --- a/utest/ctest.h +++ b/utest/ctest.h @@ -28,7 +28,10 @@ #define WEAK #endif +#ifndef __MSC_VER #include /* intmax_t, uintmax_t, PRI* */ +#endif + #include /* size_t */ typedef void (*SetupFunc)(void*); @@ -62,9 +65,14 @@ struct ctest { #undef CTEST_SEGFAULT #endif -#if defined(_WIN32) && defined(_MSC_VER) +#if defined(_WIN32) +#if defined(__clang__) +#define __CTEST_NO_TIME +#undef CTEST_SEGFAULT +#elif defined(_MSC_VER) #define __CTEST_MSVC #endif +#endif //config for MSVC compiler #ifdef __CTEST_MSVC @@ -72,6 +80,13 @@ struct ctest { #define __CTEST_NO_TIME #define CTEST_NO_COLORS +#if __MSC_VER >= 1500 +#include +#else +#include +#define CTEST_NO_INTTYPES +#endif + #ifndef CTEST_ADD_TESTS_MANUALLY #pragma section(".ctest$a") #pragma section(".ctest$u") @@ -276,7 +291,7 @@ void assert_dbl_far(double exp, double real, double tol, const char* caller, int #endif #include -#ifdef __CTEST_MSVC +#ifdef _WIN32 #include #else #include @@ -480,11 +495,19 @@ void assert_data(const unsigned char* exp, size_t expsize, const char* caller, int line) { size_t i; if (expsize != realsize) { +#ifndef CTEST_NO_INTTYPES CTEST_ERR("%s:%d expected %" PRIuMAX " bytes, got %" PRIuMAX, caller, line, (uintmax_t) expsize, (uintmax_t) realsize); +#else + CTEST_ERR("%s:%d expected %u bytes, got %u", caller, line, (uintmax_t) expsize, (uintmax_t) realsize); +#endif } for (i=0; i exp2) { +#ifndef CTEST_NO_INTTYPES CTEST_ERR("%s:%d expected %" PRIdMAX "-%" PRIdMAX ", got %" PRIdMAX, caller, line, exp1, exp2, real); +#else + CTEST_ERR("%s:%d expected %d-%d, got %d", caller, line, exp1, exp2, real); +#endif } }